fuzzyhash 0.1.1

Pure Rust fuzzy hash implementation
Documentation
use std::cmp::{min,max};
use constants;
use roll::{Roll};

const MAX_LENGTH: usize = 64;
const INSERT_COST: u32 = 1;
const REMOVE_COST: u32 = 1;
const REPLACE_COST: u32 = 2;

fn compute_distance(s1: Vec<u8>, s2: Vec<u8>) -> u32 {
    let mut t1: Vec<u32> = vec![0; MAX_LENGTH + 1];
    let mut t2: Vec<u32> = vec![0; MAX_LENGTH + 1];
    
    for i2 in 0..s2.len()+1 {
        t1[i2] = i2 as u32 * REMOVE_COST;
    }
    for i1 in 0..s1.len() {
        t2[0] = (i1 as u32 + 1) * INSERT_COST;
        for i2 in 0..s2.len() {
            let cost_a = t1[i2+1] + INSERT_COST;
            let cost_d = t2[i2] + REMOVE_COST;
            let cost_r = t1[i2] + if s1[i1] == s2[i2] {
                0
            } else {
                REPLACE_COST
            };
            t2[i2+1] = min(min(cost_a, cost_d), cost_r);
        }
        let t3 = t1.clone();
        t1 = t2.clone();
        t2 = t3;
    }
    t1[s2.len()]
}

fn has_common_substring(first: &[u8], second: &[u8]) -> bool {
    let first_length = first.to_vec().len();
    let second_length = second.to_vec().len();
    let mut i: usize = 0;
    let mut hashes: Vec<u32> = vec![0; constants::SPAM_SUM_LENGTH as usize];
    let mut state = Roll::new();

    while i < first_length && first[i] != 0 {
        state.hash(first[i]);
        hashes[i] = state.sum();
        i += 1;
    }

    let num_hashes = i;
    state = Roll::new();

    i = 0;
    while i < second_length && second[i] != 0 {
        state.hash(second[i]);
        let h = state.sum();

        if i < constants::ROLLING_WINDOW - 1 {
            i += 1;
            continue;
        }

        for j in (constants::ROLLING_WINDOW - 1)..num_hashes {
            if hashes[j] != 0 && hashes[j] == h {
                let second_start_pos = i.wrapping_sub(constants::ROLLING_WINDOW).wrapping_add(1);
                let mut len = 0;
                while len + second_start_pos < second_length &&
                    second[len + second_start_pos] != 0 {
                        len += 1;
                    }
                
                if len < constants::ROLLING_WINDOW {
                    continue;
                }

                let mut matched = true;
                let first_start_pos = j.wrapping_sub(constants::ROLLING_WINDOW).wrapping_add(1);
                for pos in 0..constants::ROLLING_WINDOW {
                    let first_char = first[first_start_pos + pos];
                    let second_char = second[second_start_pos + pos];

                    if first_char != second_char {
                        matched = false;
                        break;
                    }

                    if first_char == 0 {
                        break;
                    }
                }
                if matched {
                    return true;
                }
            }
        }
        i += 1;
    }
    false 
}

fn eliminate_sequences(input: Vec<u8>) -> Vec<u8> {
    let mut result: Vec<u8> = vec![0; input.len()];
    let mut i = 0;

    while i < 3 && i < input.len() {
        result[i] = input[i];
        i += 1;
    }

    if input.len() < 3 {
        return result
    }

    i = 3;
    let mut j = 3;

    while i < input.len() {
        let current = input[j];
        if current != input[i - 1] || current != input[i - 2] || current != input[i - 3] {
            result[j] = input[i];
            j += 1;
        }
        i += 1;
    }

    unsafe {
        result.set_len(j);
    }
    result
}

pub fn score_strings(first: Vec<u8>, second: Vec<u8>, block_size: u32) -> u32 {

    if first.len() > constants::SPAM_SUM_LENGTH as usize || 
        second.len() > constants::SPAM_SUM_LENGTH as usize {
        return 0;
    }

    if !has_common_substring(&first, &second) {
        println!("no common substring!");
        return 0;
    }

    let mut score = compute_distance(first.clone(), second.clone());
    score = (score * constants::SPAM_SUM_LENGTH) / (( first.len() + second.len() ) as u32);
    score = (100 * score) / 64;
    if score >= 100 {
        return 0;
    }

    score = 100 - score;
    
    let match_size = block_size / constants::MIN_BLOCK_SIZE * (min(first.len(), second.len()) as u32);
    if score > match_size {
        match_size
    } else {
         score
    }
}

pub fn strings(first: String, second: String) -> u32 {
    let first_parts: Vec<&str> = first.split(':').collect();
    let second_parts: Vec<&str> = second.split(':').collect();

    if first_parts.len() != 3 && second_parts.len() != 3 {
        println!("Badly formatted input strings!");
        return 0;
    }

    let first_block_size = match first_parts[0].parse::<u32>() {
        Ok(s) => s,
        Err(_) => {
            println!("Cannot parse first string's block size!");
            0
        }
    };
    let second_block_size = match second_parts[0].parse::<u32>() {
        Ok(s) => s,
        Err(_) => {
            println!("Cannot parse second string's block size!");
            0
        }
    };

    if first_block_size != second_block_size &&
        first_block_size != second_block_size * 2 &&
            second_block_size != first_block_size * 2 {
                println!("Incompatible block sizes!");
                return 0
            }

    let first_block1 = eliminate_sequences(first_parts[1].as_bytes().to_vec());
    let first_block2 = eliminate_sequences(first_parts[2].as_bytes().to_vec());
    
    let second_block1 = eliminate_sequences(second_parts[1].as_bytes().to_vec());
    let second_block2 = eliminate_sequences(second_parts[2].as_bytes().to_vec());

    if first_block_size == second_block_size && first_block1.len() == second_block1.len() {
        let mut matched = true;
        for i in 0..first_block1.len() {
            if first_block1[i] != second_block1[i] {
                matched = false;
                break;
            }
        }
        if matched {
            return 100;
        }
    }

    if first_block_size == second_block_size {
        let score1 = score_strings(first_block1, second_block1, first_block_size);
        let score2 = score_strings(first_block2, second_block2, first_block_size * 2);
        return max(score1, score2);
    }
    else if first_block_size == second_block_size * 2 {
        return score_strings(first_block1, second_block2, first_block_size);
    }
    else {
        return score_strings(first_block2, second_block1, second_block_size);
    }
}