use std::cell::OnceCell;
use std::collections::hash_map::IntoIter;
use std::collections::{HashMap, HashSet};
use std::fs::File;
use std::io::{BufReader, BufWriter, Read, Seek, SeekFrom, Write};
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::SystemTime;

use file_declutter::FileDeclutter;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use thiserror::Error;
use walkdir::WalkDir;

mod cache;

#[derive(Debug, Error)]
pub enum Error {
    #[error(transparent)]
    Io(#[from] std::io::Error),
}

type Result<R> = std::result::Result<R, Error>;

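/// Reads up to `len` bytes starting at `offset`, retrying on short reads.
/// Uses positional reads (`read_at`), which leave the file cursor untouched,
/// so it is safe to call concurrently on a shared `File`.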
#[cfg(unix)]
fn read_at_chunk(file: &File, offset: u64, len: usize) -> std::io::Result<Vec<u8>> {
    use std::os::unix::fs::FileExt;
    let mut buf = vec![0u8; len];
    let mut pos = 0usize;
    while pos < len {
        let read = file.read_at(&mut buf[pos..], offset + pos as u64)?;
        if read == 0 {
            break;
        }
        pos += read;
    }
    buf.truncate(pos);
    Ok(buf)
}

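/// Windows equivalent of the positional read above. `seek_read` takes an
/// explicit offset on every call, so short reads are retried the same way.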
#[cfg(windows)]
fn read_at_chunk(file: &File, offset: u64, len: usize) -> std::io::Result<Vec<u8>> {
    use std::os::windows::fs::FileExt;
    let handle = file.try_clone()?;
    let mut buf = vec![0u8; len];
    let mut pos = 0usize;
    while pos < len {
        let read = handle.seek_read(&mut buf[pos..], offset + pos as u64)?;
        if read == 0 {
            break;
        }
        pos += read;
    }
    buf.truncate(pos);
    Ok(buf)
}

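/// Hash function used to fingerprint chunks. SHA-1 is the default.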
#[derive(Clone, Copy, Debug, Default, Deserialize, Serialize)]
pub enum HashingAlgorithm {
    MD5,
    #[default]
    SHA1,
    SHA256,
    SHA512,
}

impl HashingAlgorithm {
    fn select_hasher(&self) -> Box<dyn sha2::digest::DynDigest> {
        match self {
            Self::MD5 => Box::new(md5::Md5::default()),
            Self::SHA1 => Box::new(sha1::Sha1::default()),
            Self::SHA256 => Box::new(sha2::Sha256::default()),
            Self::SHA512 => Box::new(sha2::Sha512::default()),
        }
    }
}

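/// A file below a source directory together with its lazily computed chunk list.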
#[derive(Clone, Debug)]
pub struct FileWithChunks {
    base: PathBuf,
    /// Path relative to the source directory.
    pub path: String,
    /// File size in bytes at scan time.
    pub size: u64,
    /// Last modification time at scan time.
    pub mtime: SystemTime,
    chunks: OnceCell<Vec<FileChunk>>,
    hashing_algorithm: HashingAlgorithm,
}

// Two entries are "the same file" if path, size and mtime all match;
// content is deliberately not compared.
impl PartialEq for FileWithChunks {
    fn eq(&self, other: &Self) -> bool {
        self.path == other.path && self.size == other.size && self.mtime == other.mtime
    }
}

impl Eq for FileWithChunks {}

impl FileWithChunks {
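    /// Builds an entry for `path` (which must live under `source_path`),
    /// capturing its size and mtime. Chunks are not computed yet.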
    pub fn try_new(
        source_path: impl Into<PathBuf>,
        path: impl Into<PathBuf>,
        hashing_algorithm: HashingAlgorithm,
    ) -> Result<Self> {
        let base = source_path.into();

        let path = path.into();
        let metadata = path.metadata()?;

        let path = path
            .strip_prefix(&base)
            .unwrap()
            .to_string_lossy()
            .to_string();
        let size = metadata.len();
        let mtime = metadata.modified()?;

        Ok(Self {
            base,
            path,
            size,
            mtime,
            chunks: Default::default(),
            hashing_algorithm,
        })
    }

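    /// Returns the chunk list if it has already been computed.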
    pub fn get_chunks(&self) -> Option<&Vec<FileChunk>> {
        self.chunks.get()
    }

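    /// Takes the chunk list out of this entry, leaving it uncomputed.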
    pub fn take_chunks(&mut self) -> Option<Vec<FileChunk>> {
        self.chunks.take()
    }

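    /// Returns the chunk list, computing and caching it on first access.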
    pub fn get_or_calculate_chunks(&self) -> Result<&Vec<FileChunk>> {
        if self.chunks.get().is_none() {
            let chunks = self.calculate_chunks()?;

            self.chunks.set(chunks).unwrap();
        }

        Ok(self.chunks.get().unwrap())
    }

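    /// Splits the file into fixed-size chunks and hashes each one in parallel.
    /// An empty file yields a single zero-length chunk (the hash of no data).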
    fn calculate_chunks(&self) -> Result<Vec<FileChunk>> {
        let path = self.base.join(&self.path);

        let size = path.metadata()?.len();

        let hashing_algorithm = self.hashing_algorithm;

        let chunk_size = 1024 * 1024; // 1 MiB
        if size == 0 {
            let hasher = hashing_algorithm.select_hasher();
            let hash = hasher.finalize();
            let hash = base16ct::lower::encode_string(&hash);

            std::iter::once(Ok::<FileChunk, Error>(FileChunk::new(0, 0, hash))).collect()
        } else {
            let file = Arc::new(File::open(&path)?);
            let total_chunks = size.div_ceil(chunk_size);

            (0..total_chunks)
                .into_par_iter()
                .map(|chunk_idx| {
                    let offset = chunk_idx * chunk_size;
                    let len = chunk_size.min(size.saturating_sub(offset)) as usize;

                    let data = read_at_chunk(&file, offset, len)?;

                    let mut hasher = hashing_algorithm.select_hasher();
                    hasher.update(&data);
                    let hash = hasher.finalize();
                    let hash = base16ct::lower::encode_string(&hash);

                    Ok::<FileChunk, Error>(FileChunk::new(offset, data.len() as u64, hash))
                })
                .collect()
        }
    }
}

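/// A contiguous region of a file, identified by its hash.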
#[derive(Clone, Debug)]
pub struct FileChunk {
    pub start: u64,
    pub size: u64,
    pub hash: String,
    pub path: Option<String>,
}

impl FileChunk {
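    /// Creates a chunk with no associated path.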
    pub fn new(start: u64, size: u64, hash: String) -> Self {
        Self {
            start,
            size,
            hash,
            path: None,
        }
    }
}

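/// Map from relative file paths to their chunked metadata.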
pub struct DedupCache(HashMap<String, FileWithChunks>);

impl DedupCache {
    fn new() -> Self {
        Self(HashMap::new())
    }

    fn from_hashmap(hash_map: HashMap<String, FileWithChunks>) -> Self {
        Self(hash_map)
    }

    fn read_from_file(&mut self, path: impl AsRef<Path>) {
        let cache_from_file = cache::read_from_file(path);

        for x in cache_from_file {
            self.insert(x.path.clone(), x);
        }
    }

    fn write_to_file(&self, path: impl AsRef<Path>) {
        cache::write_to_file(path, self);
    }

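    /// Iterates over every chunk of every file as `(hash, chunk, dirty)`.
    /// The yielded chunk carries its owning file's path; `dirty` is `true`
    /// exactly once per file whose chunks had to be (re)computed, so callers
    /// can tell that the cache changed.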
    pub fn get_chunks(&self) -> Result<impl Iterator<Item = (String, FileChunk, bool)> + '_> {
        Ok(self.values().flat_map(|fwc| {
            // Dirty until the first chunk of a freshly computed file is yielded.
            let mut dirty = fwc.get_chunks().is_none();

            fwc.get_or_calculate_chunks()
                .unwrap()
                .iter()
                .map(move |chunk| {
                    let result = (
                        chunk.hash.clone(),
                        FileChunk {
                            path: Some(fwc.path.clone()),
                            ..chunk.clone()
                        },
                        dirty,
                    );

                    dirty = false;

                    result
                })
        }))
    }

    pub fn get(&self, path: &str) -> Option<&FileWithChunks> {
        self.0.get(path)
    }

    pub fn get_mut(&mut self, path: &str) -> Option<&mut FileWithChunks> {
        self.0.get_mut(path)
    }

    fn insert(&mut self, path: String, fwc: FileWithChunks) {
        self.0.insert(path, fwc);
    }

    pub fn contains_key(&self, path: &str) -> bool {
        self.0.contains_key(path)
    }

    pub fn into_iter(self) -> IntoIter<String, FileWithChunks> {
        self.0.into_iter()
    }

    pub fn values(&self) -> impl Iterator<Item = &FileWithChunks> {
        self.0.values()
    }

    pub fn len(&self) -> usize {
        self.0.len()
    }
}

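/// Scans a source tree, chunks its files, and writes deduplicated chunk data.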
pub struct Deduper {
    source_path: PathBuf,
    cache_path: PathBuf,
    pub cache: DedupCache,
}

impl Deduper {
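    /// Builds a deduper for `source_path`: reads every cache in `cache_paths`
    /// (entries from earlier paths take precedence, and the first path is the
    /// one `write_cache` writes back to), drops entries whose files no longer
    /// exist, then walks the source tree, keeping cached chunk data for files
    /// whose path, size and mtime are unchanged.
    ///
    /// A minimal usage sketch (the crate name `dedup` is a placeholder):
    ///
    /// ```ignore
    /// use dedup::{Deduper, HashingAlgorithm};
    ///
    /// let mut deduper = Deduper::new(
    ///     "origin",           // tree to deduplicate
    ///     vec!["cache.json"], // cache files; the first is written back
    ///     HashingAlgorithm::SHA256,
    ///     true,               // do not cross file-system boundaries
    /// );
    /// deduper.write_chunks("deduped", 3).unwrap();
    /// deduper.write_cache();
    /// ```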
    pub fn new(
        source_path: impl Into<PathBuf>,
        cache_paths: Vec<impl Into<PathBuf>>,
        hashing_algorithm: HashingAlgorithm,
        same_file_system: bool,
    ) -> Self {
        let source_path = source_path.into();

        let mut cache = DedupCache::new();

        // Read caches back to front so that entries from earlier paths win;
        // the first path is kept as the one to write back to.
        let cache_path = {
            let mut cache_path = Default::default();
            for cache_path_from_iter in cache_paths.into_iter().rev() {
                cache_path = cache_path_from_iter.into();
                cache.read_from_file(&cache_path);
            }
            cache_path
        };

        // Drop cached entries whose files have disappeared from the source tree.
        cache = DedupCache::from_hashmap(
            cache
                .into_iter()
                .filter(|(path, _)| source_path.join(path).exists())
                .collect(),
        );

        let dir_walker = WalkDir::new(&source_path)
            .min_depth(1)
            .same_file_system(same_file_system);

        for entry in dir_walker {
            let entry = entry.unwrap().into_path();

            if !entry.is_file() {
                continue;
            }

            let fwc = FileWithChunks::try_new(&source_path, &entry, hashing_algorithm).unwrap();

            // Unchanged files keep their cached chunks; only rebase them.
            if let Some(fwc_cache) = cache.get_mut(&fwc.path) {
                if fwc == *fwc_cache {
                    fwc_cache.base = source_path.clone();
                    continue;
                }
            }

            cache.insert(fwc.path.clone(), fwc);
        }

        Self {
            source_path,
            cache_path,
            cache,
        }
    }

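    /// Atomically persists the cache: writes to a timestamped temporary file
    /// next to the cache, then renames it into place. Does nothing if the
    /// cache path has no file name (e.g. it was never set).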
    pub fn write_cache(&self) {
        if self.cache_path.file_name().is_none() {
            return;
        }

        let temp_path = self.cache_path.with_extension(format!(
            "tmp.{}.{}",
            SystemTime::now()
                .duration_since(SystemTime::UNIX_EPOCH)
                .unwrap()
                .as_millis(),
            self.cache_path
                .extension()
                .unwrap_or("ext".as_ref())
                .to_str()
                .unwrap()
        ));
        self.cache.write_to_file(&temp_path);
        std::fs::rename(temp_path, &self.cache_path).unwrap();
    }

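    /// Writes every chunk that is not yet present under `target_path/data`,
    /// fanning files out over `declutter_levels` directory levels.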
    pub fn write_chunks(
        &mut self,
        target_path: impl Into<PathBuf>,
        declutter_levels: usize,
    ) -> Result<()> {
        let target_path = target_path.into();
        let data_dir = target_path.join("data");
        std::fs::create_dir_all(&data_dir)?;
        for (_, chunk, _) in self.cache.get_chunks()? {
            let mut chunk_file = PathBuf::from(&chunk.hash);
            if declutter_levels > 0 {
                chunk_file = FileDeclutter::oneshot(chunk_file, declutter_levels);
            }
            chunk_file = data_dir.join(chunk_file);

            if !chunk_file.exists() {
                std::fs::create_dir_all(chunk_file.parent().unwrap())?;
                let mut out = File::create(chunk_file)?;
                let mut src = BufReader::new(File::open(
                    self.source_path.join(chunk.path.as_ref().unwrap()),
                )?);
                src.seek(SeekFrom::Start(chunk.start))?;
                let mut limited = src.take(chunk.size);
                std::io::copy(&mut limited, &mut out)?;
            }
        }

        Ok(())
    }
}

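/// Restores files from a deduplicated store using one or more cache files.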
pub struct Hydrator {
    source_path: PathBuf,
    pub cache: DedupCache,
}

impl Hydrator {
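    /// Reads all `cache_paths` (entries from earlier paths take precedence)
    /// for the deduplicated store at `source_path`.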
    pub fn new(source_path: impl Into<PathBuf>, cache_paths: Vec<impl Into<PathBuf>>) -> Self {
        let source_path = source_path.into();

        let mut cache = DedupCache::new();

        for cache_path in cache_paths.into_iter().rev() {
            let cache_path = cache_path.into();
            cache.read_from_file(&cache_path);
        }

        Self { source_path, cache }
    }

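    /// Reassembles every cached file under `target_path` by concatenating its
    /// chunks from the store's `data` directory, restoring mtimes as it goes.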
    pub fn restore_files(&self, target_path: impl Into<PathBuf>, declutter_levels: usize) {
        let data_dir = self.source_path.join("data");
        let target_path = target_path.into();
        std::fs::create_dir_all(&target_path).unwrap();
        for fwc in self.cache.values() {
            let target = target_path.join(&fwc.path);
            std::fs::create_dir_all(target.parent().unwrap()).unwrap();
            let target_file = File::create(&target).unwrap();
            let mut target = BufWriter::new(&target_file);
            for chunk in fwc.get_chunks().unwrap() {
                let mut chunk_file = PathBuf::from(&chunk.hash);
                if declutter_levels > 0 {
                    chunk_file = FileDeclutter::oneshot(chunk_file, declutter_levels);
                }
                chunk_file = data_dir.join(chunk_file);

                let mut source = File::open(chunk_file).unwrap();
                std::io::copy(&mut source, &mut target).unwrap();
            }
            target.flush().unwrap();
            target_file.set_modified(fwc.mtime).unwrap();
        }
    }

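    /// Yields `(path, reason)` for every chunk file that is missing from the
    /// store or has an unexpected size.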
    pub fn list_missing_chunks(
        &self,
        declutter_levels: usize,
    ) -> impl Iterator<Item = (PathBuf, String)> {
        let mut hashes_and_chunks = self
            .cache
            .get_chunks()
            .unwrap()
            .map(|(hash, chunk, ..)| (PathBuf::from(hash), chunk))
            .collect::<Vec<_>>();
        // Each distinct hash is stored only once, so deduplicate before checking.
        hashes_and_chunks.sort_by(|a, b| a.0.cmp(&b.0));
        hashes_and_chunks.dedup_by(|a, b| a.0 == b.0);

        let (hashes, chunks): (Vec<_>, Vec<_>) = hashes_and_chunks.into_iter().unzip();

        let files_in_cache = FileDeclutter::new_from_iter(hashes.into_iter())
            .base(&self.source_path.join("data"))
            .levels(declutter_levels)
            .map(|(_, path)| path);

        files_in_cache.zip(chunks).filter_map(|(path, chunk)| {
            if !path.exists() {
                Some((path, "Does not exist".to_string()))
            } else if path.metadata().unwrap().len() != chunk.size {
                Some((
                    path,
                    format!("Does not have expected size of {}", chunk.size),
                ))
            } else {
                None
            }
        })
    }

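    /// Returns `true` if every expected chunk file is present with the right size.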
    pub fn check_cache(&self, declutter_levels: usize) -> bool {
        self.list_missing_chunks(declutter_levels).next().is_none()
    }

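    /// Yields files under the store's `data` directory that no cached chunk
    /// refers to.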
    pub fn list_extra_files(&self, declutter_levels: usize) -> impl Iterator<Item = PathBuf> {
        let files_in_cache = FileDeclutter::new_from_iter(
            self.cache
                .get_chunks()
                .unwrap()
                .map(|(hash, ..)| PathBuf::from(hash)),
        )
        .base(&self.source_path.join("data"))
        .levels(declutter_levels)
        .map(|(_, path)| path)
        .collect::<HashSet<_>>();

        WalkDir::new(self.source_path.join("data"))
            .min_depth(1)
            .same_file_system(false)
            .into_iter()
            .filter(move |entry| {
                entry
                    .as_ref()
                    .map(|entry| {
                        entry.file_type().is_file() && !files_in_cache.contains(entry.path())
                    })
                    .unwrap_or_default()
            })
            .flatten()
            .map(|entry| entry.into_path())
    }

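    /// Removes every file reported by [`Self::list_extra_files`].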
    pub fn delete_extra_files(&self, declutter_levels: usize) -> anyhow::Result<()> {
        for path in self.list_extra_files(declutter_levels) {
            std::fs::remove_file(&path)?;
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use std::fs::OpenOptions;

    use assert_fs::fixture::ChildPath;
    use assert_fs::prelude::*;
    use assert_fs::{NamedTempFile, TempDir};

    use super::*;

    fn setup() -> anyhow::Result<(TempDir, ChildPath, ChildPath, ChildPath)> {
        let temp = TempDir::new()?;

        let origin = temp.child("origin");
        origin.create_dir_all()?;
        origin.child("README.md").write_str("Hello, world!")?;

        let deduped = temp.child("deduped");
        deduped.create_dir_all()?;

        let cache = temp.child("cache.json");

        {
            let mut deduper = Deduper::new(
                origin.to_path_buf(),
                vec![cache.to_path_buf()],
                HashingAlgorithm::MD5,
                true,
            );
            deduper.write_chunks(deduped.to_path_buf(), 3)?;
            deduper.write_cache();
        }

        Ok((temp, origin, deduped, cache))
    }

    #[test]
    fn compare_filechunk_objects() -> anyhow::Result<()> {
        let temp = TempDir::new()?;

        let file_1 = temp.child("file_1");
        std::fs::write(&file_1, "content_1")?;

        let file_2 = temp.child("file_2");
        std::fs::write(&file_2, "content_2")?;

        let fwc_1 = FileWithChunks::try_new(temp.path(), file_1.path(), HashingAlgorithm::MD5)?;
        let fwc_1_same =
            FileWithChunks::try_new(temp.path(), file_1.path(), HashingAlgorithm::MD5)?;
        let fwc_2 = FileWithChunks::try_new(temp.path(), file_2.path(), HashingAlgorithm::MD5)?;

        assert_eq!(fwc_1, fwc_1);
        assert_eq!(fwc_1, fwc_1_same);
        assert_ne!(fwc_1, fwc_2);

        // Touching the mtime must break equality even though content is unchanged.
        OpenOptions::new()
            .write(true)
            .open(&file_1)?
            .set_modified(SystemTime::now())?;

        let fwc_1_new =
            FileWithChunks::try_new(temp.path(), file_1.path(), HashingAlgorithm::MD5)?;

        assert_ne!(fwc_1, fwc_1_new);

        Ok(())
    }

    #[test]
    fn check_all_hashing_algorithms() -> anyhow::Result<()> {
        let algorithms = &[
            (HashingAlgorithm::MD5, "0fb073cd346f46f60c15e719f3820482"),
            (
                HashingAlgorithm::SHA1,
                "5503f5edc1bba66a7733c5ec38f4e9d449021be9",
            ),
            (
                HashingAlgorithm::SHA256,
                "e8c73ac958a87f17906b092bd99f37038788ee23b271574aad6d5bf1c76cc61c",
            ),
            (
                HashingAlgorithm::SHA512,
                "e6eda213df25f96ca380dd07640df530574e380c1b93d5d863fec05d5908a4880a3075fef4a438cfb1023cc51affb4624002f54b4790fe8362c7de032eb39aaa",
            ),
        ];

        let temp = TempDir::new()?;
        let file = temp.child("file");
        std::fs::write(&file, "hello rust")?;

        for (algorithm, expected_hash) in algorithms.iter().copied() {
            let cache_file = NamedTempFile::new("cache.json")?;

            let chunks = Deduper::new(temp.path(), vec![cache_file.path()], algorithm, true)
                .cache
                .get_chunks()?
                .collect::<Vec<_>>();

            assert_eq!(chunks.len(), 1, "Expected exactly one chunk");

            let (hash, _, _) = &chunks[0];
            assert_eq!(
                hash, &expected_hash,
                "Algorithm {:?} does not produce expected hash",
                algorithm
            );
        }

        Ok(())
    }

    #[test]
    fn check_cache() -> anyhow::Result<()> {
        let (_temp, _origin, deduped, cache) = setup()?;

        assert!(
            Hydrator::new(deduped.to_path_buf(), vec![cache.to_path_buf()]).check_cache(3),
            "Cache checking failed when it shouldn't"
        );

        // Deleting one decluttered subdirectory must make the check fail.
        std::fs::remove_dir_all(deduped.child("data").read_dir()?.next().unwrap()?.path())?;

        assert!(
            !Hydrator::new(deduped.to_path_buf(), vec![cache.to_path_buf()]).check_cache(3),
            "Cache checking didn't fail when it should"
        );

        Ok(())
    }

    #[test]
    fn check_list_extra() -> anyhow::Result<()> {
        let (_temp, _origin, deduped, cache) = setup()?;

        assert_eq!(
            Hydrator::new(deduped.to_path_buf(), vec![cache.to_path_buf()])
                .list_extra_files(3)
                .count(),
            0,
            "Extra files present when there shouldn't be"
        );

        deduped
            .child("data")
            .child("extra_file")
            .write_str("Hello, world!")?;

        assert_eq!(
            Hydrator::new(deduped.to_path_buf(), vec![cache.to_path_buf()])
                .list_extra_files(3)
                .count(),
            1,
            "Number of extra files present is not 1"
        );

        deduped
            .child("data")
            .child("e")
            .child("x")
            .child("t")
            .child("extra_file")
            .write_str("Hello, world!")?;

        assert_eq!(
            Hydrator::new(deduped.to_path_buf(), vec![cache.to_path_buf()])
                .list_extra_files(3)
                .count(),
            2,
            "Number of extra files present is not 2"
        );

        Ok(())
    }

    #[cfg(not(windows))]
    #[test]
    fn check_files_with_exotic_characters() -> anyhow::Result<()> {
        let temp = TempDir::new()?;

        let origin = temp.child("origin");
        origin.create_dir_all()?;

        let deduped = temp.child("deduped");
        deduped.create_dir_all()?;

        let cache = temp.child("cache.json");

        let filename_with_newline = "new\nline.txt";
        let file_with_newline = origin.child(filename_with_newline);
        file_with_newline.write_str("content")?;

        let filename_with_japanese = "日本語ファイル名";
        let file_with_japanese = origin.child(filename_with_japanese);
        file_with_japanese.write_str("content")?;

        let try_dedup = || -> anyhow::Result<()> {
            let mut deduper = Deduper::new(
                origin.to_path_buf(),
                vec![cache.to_path_buf()],
                HashingAlgorithm::MD5,
                true,
            );

            assert!(
                deduper.cache.get(filename_with_newline).is_some(),
                "File with newline is missing from cache"
            );

            assert!(
                deduper.cache.get(filename_with_japanese).is_some(),
                "File with Japanese is missing from cache"
            );

            deduper.write_chunks(deduped.to_path_buf(), 3)?;

            deduper.write_cache();

            Ok(())
        };

        // First pass chunks from scratch and writes the cache.
        try_dedup()?;

        // Second pass must find the same files via the freshly written cache.
        try_dedup()?;

        Ok(())
    }
}