use crate::cli::Cli;
use crate::printer;
use crate::types::{CompFile, ComparisonFn, FileCache, Match, Matches, MatchesLookup};
use std::collections::HashMap;
const INSERTION_COST: usize = 1;
const DELETION_COST: usize = 1;
const SUBSTITUTION_COST: usize = 1;
pub fn comparison_lambda(args: &Cli) -> ComparisonFn {
let threshold = args.lev_threshold;
if threshold == 0 {
Box::new(move |x, y| x == y)
} else {
Box::new(move |x, y| levenshtein_distance(x, y, threshold) <= threshold)
}
}
fn get_max_block_size(comp: &ComparisonFn, f1: &CompFile, f2: &CompFile) -> usize {
let mut block_length = 1;
loop {
let i1 = f1.start + block_length;
let i2 = f2.start + block_length;
if f1.file == f2.file && i1 == f2.start {
return block_length;
}
if i1 >= f1.lines.len() || i2 >= f2.lines.len() {
return block_length;
}
if comp(&f1.lines[i1], &f2.lines[i2]) {
block_length += 1;
} else {
return block_length;
}
}
}
fn update_matches(
(a, b): (Match, Match),
(where_is_match, matches_hash): (&mut MatchesLookup, &mut Matches),
) {
let mut where_is_match_to_insert = Vec::new();
let (refa, refb) = (where_is_match.0.get(&a), where_is_match.0.get(&b));
match (refa, refb, &a, &b) {
(Some(refa), Some(refb), _, _) if refa != refb => {
let mut refb_v = matches_hash.0.remove(refb).unwrap();
refb_v.push(refb.clone());
for block in &refb_v {
where_is_match_to_insert.push((block.clone(), refa.clone()));
}
matches_hash
.0
.entry(refa.clone())
.and_modify(|v| v.append(&mut refb_v));
}
(Some(refblock), None, _, b) | (None, Some(refblock), b, _) => {
matches_hash
.0
.entry(refblock.clone())
.and_modify(|v| v.push(b.clone()));
where_is_match_to_insert.push((b.clone(), refblock.clone()));
}
(None, None, a, b) => {
matches_hash.0.insert(b.clone(), vec![a.clone()]);
where_is_match_to_insert.push((a.clone(), b.clone()));
where_is_match_to_insert.push((b.clone(), b.clone()));
}
_ => {}
}
for (key, val) in where_is_match_to_insert {
where_is_match.0.insert(key, val);
}
}
fn get_matches_from_2_files(
args: &Cli,
(mut where_is_match, mut matches_hash): (MatchesLookup, Matches),
comp: &ComparisonFn,
(mut f1, mut f2): (CompFile, CompFile),
) -> (MatchesLookup, Matches) {
f1.start = 0;
while f1.start < f1.lines.len() {
printer::now_comparing(args, &f1, &f2);
if f1.current_line().len() < args.line_threshold {
f1.start += 1;
continue;
}
f2.start = if f1.file == f2.file { f1.start + 1 } else { 0 };
let mut max_block_length = 1;
while f2.start < f2.lines.len() {
if comp(f1.current_line(), f2.current_line()) {
let block_length = get_max_block_size(comp, &f1, &f2);
if block_length < args.block_threshold {
f2.start += block_length;
continue;
}
let matches = Match::from_compfiles(&f1, &f2, block_length);
update_matches(matches, (&mut where_is_match, &mut matches_hash));
f2.start += block_length;
max_block_length = std::cmp::max(max_block_length, block_length);
} else {
f2.start += 1;
}
}
f1.start += max_block_length;
}
(where_is_match, matches_hash)
}
pub fn get_all_matches(args: &Cli) -> Matches {
let mut filecache = FileCache::new();
let mut where_is_match = MatchesLookup(HashMap::new());
let mut matches_hash = Matches(HashMap::new());
let comp = comparison_lambda(args);
let mut finished_comparisons = 0;
for i in 0..args.files.len() {
for j in i..args.files.len() {
if let Some((f1, f2)) =
CompFile::from_files(&args.files[i], &args.files[j], &mut filecache)
{
(where_is_match, matches_hash) =
get_matches_from_2_files(args, (where_is_match, matches_hash), &comp, (f1, f2));
finished_comparisons += 1;
printer::done_comparison(args, finished_comparisons);
} else {
finished_comparisons += 1;
printer::skip_comparison(args, &args.files[i], &args.files[j]);
}
}
}
matches_hash
}
fn to_char_vec(s: &str) -> Vec<char> {
let mut v = Vec::with_capacity(s.len());
for c in s.chars() {
v.push(c);
}
v
}
#[allow(clippy::needless_range_loop)]
pub fn levenshtein_distance(x: &str, y: &str, threshold: usize) -> usize {
let (x, y) = (to_char_vec(x), to_char_vec(y));
let (m, n) = (x.len(), y.len());
let mut d = vec![0usize; (m + 1) * (n + 1)];
let size = m + 1;
if threshold >= std::cmp::max(m, n) {
return threshold;
}
if threshold < m.abs_diff(n) {
return threshold + 1;
}
for i in 1..(m + 1) {
d[i] = i;
}
for j in 1..(n + 1) {
d[j * size] = j;
}
for j in 1..(n + 1) {
let mut has_changed_row = false;
for i in 1..(m + 1) {
let sub_cost = if x[i - 1] == y[j - 1] {
0
} else {
SUBSTITUTION_COST
};
d[i + j * size] = std::cmp::min(
d[(i - 1) + j * size] + INSERTION_COST,
std::cmp::min(
d[i + (j - 1) * size] + DELETION_COST,
d[(i - 1) + (j - 1) * size] + sub_cost,
),
);
if d[i + j * size] <= threshold {
has_changed_row = true;
}
}
if !has_changed_row {
return threshold + 1;
}
}
d[m + n * size]
}
#[cfg(test)]
mod tests {
use super::levenshtein_distance;
macro_rules! check_lev {
( $a:literal, $b:literal, $t:literal ) => {{
check_lev!($a, $b, $t, $t);
}};
( $a:literal, $b:literal, $t:literal, $e:literal ) => {{
let dist = levenshtein_distance($a, $b, $t);
assert_eq!(
dist, $e,
"levenshtein_distance({}, {}, {}) = {}, expected {}",
$a, $b, $t, dist, $e
);
}};
}
#[test]
fn test_lev_distance() {
check_lev!("the same the same", "the same the same", 10, 0);
check_lev!("kitten", "sitting", 3);
check_lev!("train", "shine", 4);
check_lev!("a", "aaa", 2);
check_lev!("arst", "zxcv", 4);
check_lev!("ieanrstien", " ", 5, 6);
check_lev!("arstarst", "zxcv", 100, 100);
check_lev!("the same", "the same", 0);
}
}