dsc 0.1.3

dsc is a CLI tool for finding and removing duplicate files on one or more file systems, while respecting your gitignore rules.
use std::fs::File;
use std::path::PathBuf;

use anyhow::{anyhow, Context, Result};

use crate::candidate_selection::collision_tracker::CollisionType::{Occupied, Terminal};
use crate::candidate_selection::collision_tracker::FollowUpWork::{Nothing, One, Two};
use crate::candidate_selection::messages::{FileWithDescriptor, HashComputedResult};
use crate::file_descriptor::FileDescriptor;
use rustc_hash::{FxHashMap, FxHashSet};

/// Key of the partial-collision table (`CollisionTracker::hits`).
///
/// Two hash results map to the same key only when they report the same hash
/// for the same block (`offset`, `block_size`) of files with the same
/// `file_size`.
#[derive(Hash, Eq, PartialEq, Debug)]
struct PartialCollisionKey {
    /// Hash value reported for the block.
    hash: u64,
    /// Offset of the block, as reported by the hashing instructions.
    offset: u64,
    /// Read size of the block, as reported by the hashing instructions.
    block_size: u64,
    /// Total size of the file the block belongs to.
    file_size: u64,
}

/// Key under which confirmed duplicates are grouped in
/// `CollisionTracker::duplicates`.
///
/// It is built once a colliding block reaches the end of the file
/// (`offset + block_size >= file_size`), from the file size and the hash
/// reported for that block.
#[derive(Hash, Eq, PartialEq, Debug)]
pub struct FullCollisionKey {
    /// Size of every file in the duplicate group.
    pub file_size: u64,
    /// Hash reported for the last block that was hashed.
    hash: u64,
}

/// State of one slot in the partial-collision table.
#[derive(Hash, Eq, PartialEq, Debug)]
enum CollisionType {
    /// At least two files have already produced this key; the slot no longer
    /// remembers which ones.
    Terminal,
    /// Exactly one file has produced this key so far; its descriptor is kept so
    /// the file can be revisited if a second file produces the same key.
    Occupied(FileDescriptor),
}

/// Tracks which files produced which block hashes and collects the groups of
/// files whose hashes collided all the way to the end of the file.
pub struct CollisionTracker {
    /// Every path registered (via `register_path`) for a file descriptor.
    pub handles: FxHashMap<FileDescriptor, FxHashSet<PathBuf>>,
    /// Groups of file descriptors recorded as duplicates of each other.
    pub duplicates: FxHashMap<FullCollisionKey, FxHashSet<FileDescriptor>>,
    /// Partial-collision table: state per (hash, offset, block size, file size).
    hits: FxHashMap<PartialCollisionKey, CollisionType>,
    /// Descriptors for which `update_hash` has been called at least once.
    has_hash: FxHashSet<FileDescriptor>,
}

/// Outcome of `CollisionTracker::update_hash`, telling the caller which open
/// files (if any) it gets back.
///
/// NOTE(review): the returned files are presumably meant to have their next
/// block hashed by the caller — confirm against the call site.
pub enum FollowUpWork {
    /// No file is handed back.
    Nothing,
    /// The file that was just hashed is handed back.
    One(FileWithDescriptor),
    /// The file that was just hashed and the freshly reopened file that
    /// previously occupied the same slot are both handed back.
    Two(FileWithDescriptor, FileWithDescriptor),
}

impl CollisionTracker {
    pub fn new() -> Self {
        CollisionTracker {
            handles: FxHashMap::default(),
            duplicates: FxHashMap::default(),
            hits: FxHashMap::default(),
            has_hash: FxHashSet::default(),
        }
    }

    pub fn remove_file(&mut self, file_with_descriptor: FileWithDescriptor) {
        if self
            .has_hash
            .contains(&file_with_descriptor.file_descriptor)
        {
            warn!("Removing partially hashed file {:?}", file_with_descriptor)
        } else {
            self.handles.remove(&file_with_descriptor.file_descriptor);
        }
    }

    pub fn update_hash(
        &mut self,
        file_with_descriptor: FileWithDescriptor,
        hash_result: &HashComputedResult,
    ) -> Result<FollowUpWork> {
        self.has_hash.insert(file_with_descriptor.file_descriptor);
        let block_size = hash_result.instructions.read_size;
        let file_size = hash_result.instructions.file_size;
        let offset = hash_result.instructions.offset;
        let hash = hash_result.hash;
        let file_descriptor = file_with_descriptor.file_descriptor;

        let key = PartialCollisionKey {
            hash,
            offset,
            block_size,
            file_size,
        };

        if let Some(hit) = self.hits.get(&key) {
            match hit {
                // We have found a partial collision, but we need to do investigate further
                Occupied(_) if offset + block_size < file_size => {
                    let hit = self.hits.insert(key, Terminal);

                    if let Some(Occupied(occupying_handle)) = hit {
                        trace!("Reopening known file");

                        let reopened_handle = self
                            .handles
                            .get(&occupying_handle)
                            .context("Expected FileDescriptor was not present")?
                            .iter()
                            .next()
                            .map(File::open)
                            .transpose()?
                            .map(|file| FileWithDescriptor {
                                file,
                                file_descriptor: occupying_handle,
                            })
                            .context("Could not reopen file")?;

                        trace!("Returning opened file");

                        Ok(Two(file_with_descriptor, reopened_handle))
                    } else {
                        Err(anyhow!("Unexpected mismatch in collision tracker"))
                    }
                }

                // It is occupied by a file that was fully hashed, no more work can be done
                Occupied(_) => {
                    // nothing more to expand, register the full collision key - terminal
                    let dup_key = FullCollisionKey { file_size, hash };

                    let hit = self.hits.insert(key, Terminal);

                    if let Some(Occupied(occupying_handle)) = hit {
                        if let Some(set) = self.duplicates.get_mut(&dup_key) {
                            set.insert(file_descriptor);
                            set.insert(occupying_handle);
                        } else {
                            let mut set = FxHashSet::default();
                            set.insert(file_descriptor);
                            set.insert(occupying_handle);
                            self.duplicates.insert(dup_key, set);
                        }
                    }
                    Ok(Nothing)
                }

                // We had a collision before, but the files occupying this slot were promoted
                Terminal if offset + block_size < file_size => Ok(One(file_with_descriptor)),

                // this one is already colliding
                Terminal => {
                    let dup_key = FullCollisionKey { file_size, hash };

                    if let Some(set) = self.duplicates.get_mut(&dup_key) {
                        set.insert(file_descriptor);
                    }
                    Ok(Nothing)
                }
            }
        } else {
            self.hits.insert(key, Occupied(file_descriptor));
            Ok(Nothing)
        }
    }

    pub fn register_path(&mut self, file_descriptor: FileDescriptor, path: PathBuf) -> bool {
        if let Some(filenames) = self.handles.get_mut(&file_descriptor) {
            filenames.insert(path);
            false
        } else {
            let mut filenames = FxHashSet::default();
            filenames.insert(path);
            self.handles.insert(file_descriptor, filenames);
            true
        }
    }
}