dsc 0.1.3

dsc is a CLI tool for finding and removing duplicate files on one or more file systems, while respecting your gitignore rules.
use std::cmp::max;

use crate::candidate_selection::hashing::HashInstructions;

/// Selects how each follow-up set of hash instructions is derived from the
/// previous one (see `next_instructions`).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum HashingStrategy {
    /// Every follow-up step starts at the previous `offset + read_size` and
    /// uses that same value as its read size.
    Complete,
    /// Every follow-up step moves the offset to the larger of the read size
    /// and twice the previous offset, pulling it back once it passes the end
    /// of the file; the read size only doubles when the end of the file is
    /// within reach.
    Sampling,
}

impl HashingStrategy {
    /// Read size, in bytes, of the very first step for any file.
    const INITIAL_READ_SIZE: u64 = 16384;

    /// Returns the first hash instructions for a file of `file_size` bytes.
    ///
    /// The first step does not depend on the strategy: seed 0, offset 0 and a
    /// read size of 16384 bytes.
    pub fn instructions(&self, file_size: u64) -> HashInstructions {
        HashInstructions {
            seed: 0,
            offset: 0,
            file_size,
            read_size: Self::INITIAL_READ_SIZE,
        }
    }

    /// Derives the step that follows `instructions`, carrying `seed` along.
    ///
    /// * `Complete`: the new offset is the previous `offset + read_size`, and
    ///   the new read size is that same value.
    /// * `Sampling`: the new offset is the larger of the read size and twice
    ///   the previous offset. If that lands past the end of the file it is
    ///   pulled back to `file_size - read_size`, clamped at 0. The read size is
    ///   doubled only when two reads from the new offset would reach the end
    ///   of the file.
    ///
    /// `file_size` is copied over unchanged. All arithmetic saturates, so this
    /// never panics or wraps, even when `read_size` is larger than `file_size`.
    pub fn next_instructions(
        &self,
        instructions: &HashInstructions,
        seed: u64,
    ) -> HashInstructions {
        let file_size = instructions.file_size;

        match self {
            HashingStrategy::Complete => {
                let target_read = instructions.read_size.saturating_add(instructions.offset);

                HashInstructions {
                    seed,
                    offset: target_read,
                    read_size: target_read,
                    file_size,
                }
            }
            HashingStrategy::Sampling => {
                let mut proposed_offset = max(
                    instructions.read_size,
                    instructions.offset.saturating_mul(2),
                );
                let mut proposed_read_size = instructions.read_size;

                // If we jumped over the end of the file, don't seek further.
                // The read size can exceed the file size (files smaller than
                // the initial read, or after the read size doubled a few
                // times), so a plain subtraction would underflow here; clamp
                // to the start of the file instead.
                if proposed_offset > file_size {
                    proposed_offset = file_size.saturating_sub(instructions.read_size);
                }

                // If we can reach beyond the end of the file in a single read
                // action, reach right away.
                let doubled_read_size = proposed_read_size.saturating_mul(2);
                if proposed_offset.saturating_add(doubled_read_size) >= file_size {
                    proposed_read_size = doubled_read_size;
                }

                HashInstructions {
                    seed,
                    offset: proposed_offset,
                    read_size: proposed_read_size,
                    file_size,
                }
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use crate::candidate_selection::hashing::{HashInstructions, HashingStrategy};

    /// Shorthand for spelling out the instructions a test expects.
    fn step(seed: u64, offset: u64, read_size: u64, file_size: u64) -> HashInstructions {
        HashInstructions {
            seed,
            offset,
            read_size,
            file_size,
        }
    }

    /// `Complete` continues at `offset + read_size` and reads that many bytes.
    #[test]
    fn complete_strategy() {
        const FILE_SIZE: u64 = 128_000_000;

        let strategy = HashingStrategy::Complete;
        let start = strategy.instructions(FILE_SIZE);

        let second = step(3, 16384, 16384, FILE_SIZE);
        assert_eq!(second, strategy.next_instructions(&start, 3));

        let third = step(5, 32768, 32768, FILE_SIZE);
        assert_eq!(third, strategy.next_instructions(&second, 5));
    }

    /// `Sampling` doubles the read size once two reads would reach the end.
    #[test]
    fn sampling_strategy_reach_until_end() {
        const FILE_SIZE: u64 = 1700;

        let strategy = HashingStrategy::Sampling;
        let start = HashInstructions {
            read_size: 500,
            ..strategy.instructions(FILE_SIZE)
        };

        let second = step(3, 500, 500, FILE_SIZE);
        assert_eq!(second, strategy.next_instructions(&start, 3));

        let third = step(5, 1000, 1000, FILE_SIZE);
        assert_eq!(third, strategy.next_instructions(&second, 5));
    }

    /// `Sampling` pulls the offset back when doubling it overshoots the file.
    #[test]
    fn sampling_strategy_seek_back_to_end() {
        const FILE_SIZE: u64 = 10_000;

        let strategy = HashingStrategy::Sampling;
        let start = HashInstructions {
            offset: 7500,
            read_size: 500,
            ..strategy.instructions(FILE_SIZE)
        };

        assert_eq!(
            step(3, 9500, 1000, FILE_SIZE),
            strategy.next_instructions(&start, 3)
        );
    }
}