1use std::cell::OnceCell;
200use std::collections::hash_map::IntoIter;
201use std::collections::{HashMap, HashSet};
202use std::fs::File;
203use std::io::{BufReader, BufWriter, Read, Seek, SeekFrom, Write};
204use std::path::{Path, PathBuf};
205use std::sync::Arc;
206use std::time::SystemTime;
207
208use file_declutter::FileDeclutter;
209use rayon::prelude::*;
210use serde::{Deserialize, Serialize};
211use thiserror::Error;
212use walkdir::WalkDir;
213
214mod cache;
215
/// Errors produced by this crate.
#[derive(Debug, Error)]
pub enum Error {
    /// Any I/O failure encountered while reading, hashing or copying files.
    #[error(transparent)]
    Io(#[from] std::io::Error),
}

/// Crate-local result alias defaulting to [`Error`].
type Result<R> = std::result::Result<R, Error>;
223
/// Reads up to `len` bytes from `file` starting at absolute `offset`.
///
/// Returns the bytes actually read; the result is shorter than `len` only
/// when EOF is reached inside the requested span.
#[cfg(unix)]
fn read_at_chunk(file: &File, offset: u64, len: usize) -> std::io::Result<Vec<u8>> {
    use std::os::unix::fs::FileExt;

    // Positioned read (`pread`): never touches the shared file cursor, so
    // parallel workers can read from the same `File` safely.
    let mut chunk = vec![0u8; len];
    let mut filled = 0usize;

    // `read_at` may return short reads; keep going until the requested span
    // is full or EOF is hit.
    loop {
        if filled >= len {
            break;
        }
        match file.read_at(&mut chunk[filled..], offset + filled as u64)? {
            0 => break, // EOF before the chunk was filled
            n => filled += n,
        }
    }

    // Drop the unfilled tail (short final chunk at end of file).
    chunk.truncate(filled);
    Ok(chunk)
}
239
/// Reads up to `len` bytes from `file` starting at absolute `offset`
/// (Windows counterpart of the unix `read_at` version).
///
/// Returns the bytes actually read; shorter than `len` only at EOF.
#[cfg(windows)]
fn read_at_chunk(file: &File, offset: u64, len: usize) -> std::io::Result<Vec<u8>> {
    use std::os::windows::fs::FileExt;
    // Duplicate the handle before reading, since `seek_read` moves the file
    // pointer on Windows.
    // NOTE(review): duplicated handles share the file pointer with the
    // original — confirm this clone actually isolates the cursor as intended.
    let handle = file.try_clone()?;
    let mut buf = vec![0u8; len];
    let mut pos = 0usize;
    // `seek_read` may return short reads; loop until the span is filled or
    // EOF is reached.
    while pos < len {
        let read = handle.seek_read(&mut buf[pos..], offset + pos as u64)?;
        if read == 0 {
            break;
        }
        pos += read;
    }
    // Shrink to the bytes actually read (short final chunk at EOF).
    buf.truncate(pos);
    Ok(buf)
}
257
/// Digest algorithm used to hash file chunks.
///
/// Derives serde traits so the choice can round-trip through persisted data;
/// variant names are therefore part of the on-disk format.
#[derive(Clone, Copy, Debug, Default, Deserialize, Serialize)]
pub enum HashingAlgorithm {
    MD5,
    /// Default algorithm.
    #[default]
    SHA1,
    SHA256,
    SHA512,
}
267
268impl HashingAlgorithm {
269 fn select_hasher(&self) -> Box<dyn sha2::digest::DynDigest> {
271 match self {
272 Self::MD5 => Box::new(md5::Md5::default()),
273 Self::SHA1 => Box::new(sha1::Sha1::default()),
274 Self::SHA256 => Box::new(sha2::Sha256::default()),
275 Self::SHA512 => Box::new(sha2::Sha512::default()),
276 }
277 }
278}
279
/// A file under a dedup root, identified by its path relative to `base`,
/// together with a lazily computed list of chunk hashes.
#[derive(Clone, Debug)]
pub struct FileWithChunks {
    // Root directory that the relative `path` is resolved against.
    base: PathBuf,
    // Path relative to `base` (lossy UTF-8 conversion of the OS path).
    pub path: String,
    // File size in bytes at the time the entry was created.
    pub size: u64,
    // Modification time at the time the entry was created.
    pub mtime: SystemTime,
    // Lazily-populated chunk list; empty until first hashing.
    chunks: OnceCell<Vec<FileChunk>>,
    // Algorithm used when hashing this file's chunks.
    hashing_algorithm: HashingAlgorithm,
}
293
294impl PartialEq for FileWithChunks {
295 fn eq(&self, other: &Self) -> bool {
296 self.path == other.path && self.size == other.size && self.mtime == other.mtime
297 }
298}
299
300impl Eq for FileWithChunks {}
301
impl FileWithChunks {
    /// Builds an entry for `path`, which must live under `source_path`,
    /// capturing its current size and mtime. Chunks are not hashed here;
    /// they are computed lazily by [`Self::get_or_calculate_chunks`].
    ///
    /// # Errors
    /// Returns an I/O error if the file's metadata cannot be read.
    ///
    /// # Panics
    /// Panics if `path` is not prefixed by `source_path` (the `unwrap` on
    /// `strip_prefix`).
    pub fn try_new(
        source_path: impl Into<PathBuf>,
        path: impl Into<PathBuf>,
        hashing_algorithm: HashingAlgorithm,
    ) -> Result<Self> {
        let base = source_path.into();

        let path = path.into();
        let metadata = path.metadata()?;

        // Store the path relative to `base`; the lossy conversion keeps
        // non-UTF-8 names usable at the cost of replacement characters.
        let path = path
            .strip_prefix(&base)
            .unwrap()
            .to_string_lossy()
            .to_string();
        let size = metadata.len();
        let mtime = metadata.modified()?;

        Ok(Self {
            base,
            path,
            size,
            mtime,
            chunks: Default::default(),
            hashing_algorithm,
        })
    }

    /// Returns the chunk list if it has already been calculated.
    pub fn get_chunks(&self) -> Option<&Vec<FileChunk>> {
        self.chunks.get()
    }

    /// Removes and returns the chunk list, leaving the cell empty.
    pub fn take_chunks(&mut self) -> Option<Vec<FileChunk>> {
        self.chunks.take()
    }

    /// Returns the cached chunk list, hashing the file on first access.
    pub fn get_or_calculate_chunks(&self) -> Result<&Vec<FileChunk>> {
        if self.chunks.get().is_none() {
            let chunks = self.calculate_chunks()?;

            // Cannot fail: the cell was just observed empty, and
            // `cell::OnceCell` is single-threaded, so nothing can have
            // filled it in between.
            self.chunks.set(chunks).unwrap();
        }

        Ok(self.chunks.get().unwrap())
    }

    /// Hashes the file in fixed 1 MiB chunks, in parallel via rayon.
    fn calculate_chunks(&self) -> Result<Vec<FileChunk>> {
        let path = self.base.join(&self.path);

        // Size is re-read from disk here; it may differ from `self.size` if
        // the file changed since the entry was created.
        let size = path.metadata()?.len();

        let hashing_algorithm = self.hashing_algorithm;

        let chunk_size = 1024 * 1024;
        if size == 0 {
            // An empty file still yields one zero-length chunk whose hash is
            // the digest of empty input.
            let hasher = hashing_algorithm.select_hasher();
            let hash = hasher.finalize();
            let hash = base16ct::lower::encode_string(&hash);

            std::iter::once(Ok::<FileChunk, Error>(FileChunk::new(0, 0, hash))).collect()
        } else {
            let file = Arc::new(File::open(&path)?);
            // Ceiling division: number of chunks covering `size` bytes.
            let total_chunks = (size + chunk_size - 1) / chunk_size;

            (0..total_chunks)
                .into_par_iter()
                .map(|chunk_idx| {
                    let offset = chunk_idx * chunk_size;
                    // Last chunk may be shorter than `chunk_size`.
                    let len = chunk_size.min(size.saturating_sub(offset)) as usize;

                    // Positioned reads keep the parallel workers independent.
                    let data = read_at_chunk(&file, offset, len)?;

                    let mut hasher = hashing_algorithm.select_hasher();
                    hasher.update(&data);
                    let hash = hasher.finalize();
                    let hash = base16ct::lower::encode_string(&hash);

                    Ok::<FileChunk, Error>(FileChunk::new(offset, data.len() as u64, hash))
                })
                .collect()
        }
    }
}
393
/// A single hashed span of a file.
#[derive(Clone, Debug)]
pub struct FileChunk {
    pub start: u64,
    pub size: u64,
    pub hash: String,
    pub path: Option<String>,
}

impl FileChunk {
    /// Creates a chunk covering `size` bytes at offset `start` with digest
    /// `hash`. The owning file's path is attached later (when the cache
    /// hands chunks out), hence `None` here.
    pub fn new(start: u64, size: u64, hash: String) -> Self {
        let path = None;
        FileChunk {
            start,
            size,
            hash,
            path,
        }
    }
}
414
415pub struct DedupCache(HashMap<String, FileWithChunks>);
417
418impl DedupCache {
419 fn new() -> Self {
421 Self(HashMap::new())
422 }
423
424 fn from_hashmap(hash_map: HashMap<String, FileWithChunks>) -> Self {
426 Self(hash_map)
427 }
428
429 fn read_from_file(&mut self, path: impl AsRef<Path>) {
431 let cache_from_file = cache::read_from_file(path);
432
433 for x in cache_from_file {
434 self.insert(x.path.clone(), x);
435 }
436 }
437
438 fn write_to_file(&self, path: impl AsRef<Path>) {
440 cache::write_to_file(path, self);
441 }
442
443 pub fn get_chunks(&self) -> Result<impl Iterator<Item = (String, FileChunk, bool)> + '_> {
446 Ok(self.values().flat_map(|fwc| {
447 let mut dirty = fwc.get_chunks().is_none();
448
449 fwc.get_or_calculate_chunks()
450 .unwrap()
451 .iter()
452 .map(move |chunk| {
453 let result = (
454 chunk.hash.clone(),
455 FileChunk {
456 path: Some(fwc.path.clone()),
457 ..chunk.clone()
458 },
459 dirty,
460 );
461
462 dirty = false;
463
464 result
465 })
466 }))
467 }
468
469 pub fn get(&self, path: &str) -> Option<&FileWithChunks> {
470 self.0.get(path)
471 }
472
473 pub fn get_mut(&mut self, path: &str) -> Option<&mut FileWithChunks> {
474 self.0.get_mut(path)
475 }
476
477 fn insert(&mut self, path: String, fwc: FileWithChunks) {
478 self.0.insert(path, fwc);
479 }
480
481 pub fn contains_key(&self, path: &str) -> bool {
482 self.0.contains_key(path)
483 }
484
485 pub fn into_iter(self) -> IntoIter<String, FileWithChunks> {
486 self.0.into_iter()
487 }
488
489 pub fn values(&self) -> impl Iterator<Item = &FileWithChunks> {
490 self.0.values()
491 }
492
493 pub fn len(&self) -> usize {
494 self.0.len()
495 }
496}
497
/// Splits a source tree into content-addressed chunks, keeping a cache of
/// per-file chunk hashes so unchanged files are not rehashed.
pub struct Deduper {
    // Root of the tree being deduplicated.
    source_path: PathBuf,
    // Cache file that `write_cache` persists to (derived from the first
    // entry of the constructor's `cache_paths`).
    cache_path: PathBuf,
    pub cache: DedupCache,
}
505
impl Deduper {
    /// Builds a deduper over `source_path`.
    ///
    /// All `cache_paths` are read and merged in reverse order, so entries
    /// from earlier paths overwrite later ones; the first path is kept as
    /// the write target for [`Self::write_cache`]. Cached entries whose files
    /// no longer exist are dropped, then the tree is walked: files matching
    /// a cache entry (same path, size, mtime) keep their cached chunks,
    /// everything else gets a fresh, not-yet-hashed entry.
    ///
    /// # Panics
    /// Panics on directory-walk errors or if a file's metadata cannot be
    /// read (the `unwrap`s below).
    pub fn new(
        source_path: impl Into<PathBuf>,
        cache_paths: Vec<impl Into<PathBuf>>,
        hashing_algorithm: HashingAlgorithm,
        same_file_system: bool,
    ) -> Self {
        let source_path = source_path.into();

        let mut cache = DedupCache::new();

        // Read caches back-to-front so the first path has highest precedence
        // and ends up as the persisted `cache_path`.
        let cache_path = {
            let mut cache_path = Default::default();
            for cache_path_from_iter in cache_paths.into_iter().rev() {
                cache_path = cache_path_from_iter.into();
                cache.read_from_file(&cache_path);
            }
            cache_path
        };

        // Only plain files count; symlinks are skipped entirely.
        let valid_entry = |path: &PathBuf| path.is_file() && !path.is_symlink();

        // Drop cache entries whose backing files have disappeared.
        cache = DedupCache::from_hashmap(
            cache
                .into_iter()
                .filter(|(path, _)| valid_entry(&source_path.join(path)))
                .collect(),
        );

        let dir_walker = WalkDir::new(&source_path)
            .min_depth(1)
            .same_file_system(same_file_system);

        for entry in dir_walker {
            let entry = entry.unwrap().into_path();

            if !valid_entry(&entry) {
                continue;
            }

            let fwc = FileWithChunks::try_new(&source_path, &entry, hashing_algorithm).unwrap();

            if let Some(fwc_cache) = cache.get_mut(&fwc.path) {
                // Unchanged file: keep the cached chunks, just re-anchor the
                // entry to the current source root.
                if fwc == *fwc_cache {
                    fwc_cache.base = source_path.clone();
                    continue;
                }
            }

            cache.insert(fwc.path.clone(), fwc);
        }

        Self {
            source_path,
            cache_path,
            cache,
        }
    }

    /// Persists the cache to `cache_path` via a temp file + rename, so a
    /// crash mid-write cannot leave a truncated cache.
    ///
    /// Does nothing when no cache path was configured (empty `cache_paths`
    /// leaves `cache_path` without a file name).
    pub fn write_cache(&self) {
        if self.cache_path.file_name().is_none() {
            return;
        }

        // Temp sibling name embeds a millisecond timestamp plus the original
        // extension, e.g. `cache.tmp.1700000000000.json`.
        let temp_path = self.cache_path.clone().with_extension(format!(
            "tmp.{}.{}",
            SystemTime::now()
                .duration_since(SystemTime::UNIX_EPOCH)
                .unwrap()
                .as_millis(),
            self.cache_path
                .extension()
                .unwrap_or("ext".as_ref())
                .to_str()
                .unwrap()
        ));
        self.cache.write_to_file(&temp_path);
        std::fs::rename(temp_path, &self.cache_path).unwrap();
    }

    /// Writes every unique chunk into `<target_path>/data`, content-addressed
    /// by hash and optionally decluttered into `declutter_levels` nested
    /// directories. Chunks whose file already exists are skipped.
    ///
    /// # Errors
    /// Returns an I/O error if directories, chunk files or source reads fail.
    pub fn write_chunks(
        &mut self,
        target_path: impl Into<PathBuf>,
        declutter_levels: usize,
    ) -> Result<()> {
        let target_path = target_path.into();
        let data_dir = target_path.join("data");
        std::fs::create_dir_all(&data_dir)?;
        for (_, chunk, _) in self.cache.get_chunks()? {
            let mut chunk_file = PathBuf::from(&chunk.hash);
            if declutter_levels > 0 {
                chunk_file = FileDeclutter::oneshot(chunk_file, declutter_levels);
            }
            chunk_file = data_dir.join(chunk_file);

            // Content-addressed: an existing file already holds this data.
            if !chunk_file.exists() {
                std::fs::create_dir_all(&chunk_file.parent().unwrap())?;
                let mut out = File::create(chunk_file)?;
                let mut src = BufReader::new(File::open(
                    self.source_path.join(chunk.path.as_ref().unwrap()),
                )?);
                // Copy exactly this chunk's byte range out of the source file.
                src.seek(SeekFrom::Start(chunk.start))?;
                let mut limited = src.take(chunk.size);
                std::io::copy(&mut limited, &mut out)?;
            }
        }

        Ok(())
    }
}
623
/// Reassembles original files from a deduplicated chunk store, and verifies
/// or prunes that store.
pub struct Hydrator {
    // Root of the deduplicated store (chunks live under `<root>/data`).
    source_path: PathBuf,
    pub cache: DedupCache,
}
629
impl Hydrator {
    /// Builds a hydrator over the dedup store at `source_path`, merging all
    /// `cache_paths` (read in reverse so earlier paths take precedence).
    pub fn new(source_path: impl Into<PathBuf>, cache_paths: Vec<impl Into<PathBuf>>) -> Self {
        let source_path = source_path.into();

        let mut cache = DedupCache::new();

        for cache_path in cache_paths.into_iter().rev() {
            let cache_path = cache_path.into();
            cache.read_from_file(&cache_path);
        }

        Self { source_path, cache }
    }

    /// Recreates every cached file under `target_path` by concatenating its
    /// chunks from `<source_path>/data`, restoring each file's mtime.
    ///
    /// # Panics
    /// Panics on any I/O failure, and if a cache entry has no chunk list
    /// (`get_chunks().unwrap()` — assumes entries loaded from the cache file
    /// already carry their chunks).
    pub fn restore_files(&self, target_path: impl Into<PathBuf>, declutter_levels: usize) {
        let data_dir = self.source_path.join("data");
        let target_path = target_path.into();
        std::fs::create_dir_all(&target_path).unwrap();
        for fwc in self.cache.values() {
            let target = target_path.join(&fwc.path);
            std::fs::create_dir_all(&target.parent().unwrap()).unwrap();
            let target_file = File::create(&target).unwrap();
            let mut target = BufWriter::new(&target_file);
            for chunk in fwc.get_chunks().unwrap() {
                // Resolve the chunk's content-addressed location, matching
                // the layout `write_chunks` produced.
                let mut chunk_file = PathBuf::from(&chunk.hash);
                if declutter_levels > 0 {
                    chunk_file = FileDeclutter::oneshot(chunk_file, declutter_levels);
                }
                chunk_file = data_dir.join(chunk_file);

                let mut source = File::open(chunk_file).unwrap();
                std::io::copy(&mut source, &mut target).unwrap();
            }
            // Flush before touching mtime so the write itself doesn't bump it.
            target.flush().unwrap();
            target_file.set_modified(fwc.mtime).unwrap()
        }
    }

    /// Lists chunk files that are missing from the store or have the wrong
    /// size, as `(path, reason)` pairs. Duplicate hashes are checked once.
    pub fn list_missing_chunks(
        &self,
        declutter_levels: usize,
    ) -> impl Iterator<Item = (PathBuf, String)> {
        // Sort + dedup by hash so each unique chunk is verified exactly once;
        // the `chunks` vec stays aligned with `hashes` through the unzip.
        let mut hashes_and_chunks = self
            .cache
            .get_chunks()
            .unwrap()
            .map(|(hash, chunk, ..)| (PathBuf::from(hash), chunk))
            .collect::<Vec<_>>();
        hashes_and_chunks.sort_by(|a, b| a.0.cmp(&b.0));
        hashes_and_chunks.dedup_by(|a, b| a.0 == b.0);

        let (hashes, chunks): (Vec<_>, Vec<_>) = hashes_and_chunks.into_iter().unzip();

        let files_in_cache = FileDeclutter::new_from_iter(hashes.into_iter())
            .base(&self.source_path.join("data"))
            .levels(declutter_levels)
            .map(|(_, path)| path);

        files_in_cache
            .zip(chunks)
            .into_iter()
            .filter_map(|(path, chunk)| {
                if !path.exists() {
                    Some((path, "Does not exist".to_string()))
                } else if path.metadata().unwrap().len() != chunk.size {
                    Some((
                        path,
                        format!("Does not have expected size of {}", chunk.size),
                    ))
                } else {
                    None
                }
            })
    }

    /// Returns `true` when every expected chunk file exists with the right
    /// size.
    pub fn check_cache(&self, declutter_levels: usize) -> bool {
        self.list_missing_chunks(declutter_levels).next().is_none()
    }

    /// Lists files under `<source_path>/data` that no cached chunk refers to.
    pub fn list_extra_files(&self, declutter_levels: usize) -> impl Iterator<Item = PathBuf> {
        // Expected chunk paths, decluttered exactly like the store layout.
        let files_in_cache = FileDeclutter::new_from_iter(
            self.cache
                .get_chunks()
                .unwrap()
                .map(|(hash, ..)| PathBuf::from(hash)),
        )
        .base(&self.source_path.join("data"))
        .levels(declutter_levels)
        .map(|(_, path)| path)
        .collect::<HashSet<_>>();

        // Walk errors are silently skipped (`unwrap_or_default` + `flatten`).
        WalkDir::new(&self.source_path.join("data"))
            .min_depth(1)
            .same_file_system(false)
            .into_iter()
            .filter(move |entry| {
                entry
                    .as_ref()
                    .map(|entry| {
                        entry.file_type().is_file() && !files_in_cache.contains(entry.path())
                    })
                    .unwrap_or_default()
            })
            .flatten()
            .map(|entry| entry.into_path())
    }

    /// Deletes every file reported by [`Self::list_extra_files`].
    ///
    /// # Errors
    /// Returns the first removal failure.
    pub fn delete_extra_files(&self, declutter_levels: usize) -> anyhow::Result<()> {
        for path in self.list_extra_files(declutter_levels) {
            std::fs::remove_file(&path)?;
        }

        Ok(())
    }
}
752
#[cfg(test)]
mod tests {
    use std::fs::OpenOptions;

    use assert_fs::fixture::ChildPath;
    use assert_fs::prelude::*;
    use assert_fs::{NamedTempFile, TempDir};

    use super::*;

    // Creates a temp tree with an `origin` dir holding one file, runs one
    // dedup pass into `deduped` (declutter level 3), and persists the cache.
    fn setup() -> anyhow::Result<(TempDir, ChildPath, ChildPath, ChildPath)> {
        let temp = TempDir::new()?;

        let origin = temp.child("origin");
        origin.create_dir_all()?;
        origin.child("README.md").write_str("Hello, world!")?;

        let deduped = temp.child("deduped");
        deduped.create_dir_all()?;

        let cache = temp.child("cache.json");

        {
            let mut deduper = Deduper::new(
                origin.to_path_buf(),
                vec![cache.to_path_buf()],
                HashingAlgorithm::MD5,
                true,
            );
            deduper.write_chunks(deduped.to_path_buf(), 3)?;
            deduper.write_cache();
        }

        Ok((temp, origin, deduped, cache))
    }

    // Equality is (path, size, mtime): same file compares equal across
    // instances, and touching the mtime must break equality.
    #[test]
    fn compare_filechunk_objects() -> anyhow::Result<()> {
        let temp = TempDir::new()?;

        let file_1 = temp.child("file_1");
        std::fs::write(&file_1, "content_1")?;

        let file_2 = temp.child("file_2");
        std::fs::write(&file_2, "content_2")?;

        let fwc_1 = FileWithChunks::try_new(&temp.path(), &file_1.path(), HashingAlgorithm::MD5)?;
        let fwc_1_same =
            FileWithChunks::try_new(&temp.path(), &file_1.path(), HashingAlgorithm::MD5)?;
        let fwc_2 = FileWithChunks::try_new(&temp.path(), &file_2.path(), HashingAlgorithm::MD5)?;

        assert_eq!(fwc_1, fwc_1);
        assert_eq!(fwc_1, fwc_1_same);
        assert_ne!(fwc_1, fwc_2);

        // Bump mtime without changing content; equality must now fail.
        OpenOptions::new()
            .write(true)
            .open(&file_1)?
            .set_modified(SystemTime::now())?;

        let fwc_1_new =
            FileWithChunks::try_new(&temp.path(), &file_1.path(), HashingAlgorithm::MD5)?;

        assert_ne!(fwc_1, fwc_1_new);

        Ok(())
    }

    // Known-answer digests for the content "hello rust" across all four
    // algorithms; each run gets a fresh cache file.
    #[test]
    fn check_all_hashing_algorithms() -> anyhow::Result<()> {
        let algorithms = &[
            (HashingAlgorithm::MD5, "0fb073cd346f46f60c15e719f3820482"),
            (
                HashingAlgorithm::SHA1,
                "5503f5edc1bba66a7733c5ec38f4e9d449021be9",
            ),
            (
                HashingAlgorithm::SHA256,
                "e8c73ac958a87f17906b092bd99f37038788ee23b271574aad6d5bf1c76cc61c",
            ),
            (
                HashingAlgorithm::SHA512,
                "e6eda213df25f96ca380dd07640df530574e380c1b93d5d863fec05d5908a4880a3075fef4a438cfb1023cc51affb4624002f54b4790fe8362c7de032eb39aaa",
            ),
        ];

        let temp = TempDir::new()?;
        let file = temp.child("file");
        std::fs::write(&file, "hello rust")?;

        for (algorithm, expected_hash) in algorithms.iter().copied() {
            let cache_file = NamedTempFile::new("cache.json")?;

            let chunks = Deduper::new(temp.path(), vec![cache_file.path()], algorithm, true)
                .cache
                .get_chunks()?
                .collect::<Vec<_>>();

            assert_eq!(chunks.len(), 1, "Too many chunks");

            let (hash, _, _) = &chunks[0];
            assert_eq!(
                hash, &expected_hash,
                "Algorithm {:?} does not produce expected hash",
                algorithm
            );
        }

        Ok(())
    }

    // check_cache must pass on a fresh store and fail after a chunk
    // directory is deleted.
    #[test]
    fn check_cache() -> anyhow::Result<()> {
        let (_temp, _origin, deduped, cache) = setup()?;

        assert!(
            Hydrator::new(deduped.to_path_buf(), vec![cache.to_path_buf()]).check_cache(3),
            "Cache checking failed when it shouldn't"
        );

        // Remove the first decluttered subdirectory, losing its chunks.
        std::fs::remove_dir_all(deduped.child("data").read_dir()?.next().unwrap()?.path())?;

        assert!(
            !Hydrator::new(deduped.to_path_buf(), vec![cache.to_path_buf()]).check_cache(3),
            "Cache checking didn't fail when it should"
        );

        Ok(())
    }

    // list_extra_files must count unreferenced files in `data`, both at the
    // top level and nested at declutter depth.
    #[test]
    fn check_list_extra() -> anyhow::Result<()> {
        let (_temp, _origin, deduped, cache) = setup()?;

        assert_eq!(
            Hydrator::new(deduped.to_path_buf(), vec![cache.to_path_buf()])
                .list_extra_files(3)
                .count(),
            0,
            "Extra files present when there shouldn't be"
        );

        deduped
            .child("data")
            .child("extra_file")
            .write_str("Hello, world!")?;

        assert_eq!(
            Hydrator::new(deduped.to_path_buf(), vec![cache.to_path_buf()])
                .list_extra_files(3)
                .count(),
            1,
            "Number of extra files present is not 1"
        );

        deduped
            .child("data")
            .child("e")
            .child("x")
            .child("t")
            .child("extra_file")
            .write_str("Hello, world!")?;

        assert_eq!(
            Hydrator::new(deduped.to_path_buf(), vec![cache.to_path_buf()])
                .list_extra_files(3)
                .count(),
            2,
            "Number of extra files present is not 2"
        );

        Ok(())
    }

    // Filenames with newlines and non-ASCII characters must survive a full
    // dedup pass and a second pass that reads the cache back from disk.
    // (Skipped on Windows, where such names may be invalid.)
    #[cfg(not(windows))]
    #[test]
    fn check_files_with_exotic_characters() -> anyhow::Result<()> {
        let temp = TempDir::new()?;

        let origin = temp.child("origin");
        origin.create_dir_all()?;

        let deduped = temp.child("deduped");
        deduped.create_dir_all()?;

        let cache = temp.child("cache.json");

        let filename_with_newline = "new\nline.txt";
        let file_with_newline = origin.child(filename_with_newline);
        file_with_newline.write_str("content")?;

        let filename_with_japanese = "日本語ファイル名";
        let file_with_japanese = origin.child(filename_with_japanese);
        file_with_japanese.write_str("content")?;

        let try_dedup = || -> anyhow::Result<()> {
            let mut deduper = Deduper::new(
                origin.to_path_buf(),
                vec![cache.to_path_buf()],
                HashingAlgorithm::MD5,
                true,
            );

            assert!(
                deduper.cache.get(&filename_with_newline).is_some(),
                "File with newline is missing from cache"
            );

            assert!(
                deduper.cache.get(&filename_with_japanese).is_some(),
                "File with Japanese is missing from cache"
            );

            deduper.write_chunks(deduped.to_path_buf(), 3)?;

            deduper.write_cache();

            Ok(())
        };

        try_dedup()?;

        try_dedup()?;

        Ok(())
    }
}