rapidfuzz 0.1.0

rapid string matching library
Documentation
use crate::levenshtein::{normalized_weighted_levenshtein, weighted_levenshtein};
use crate::editops;

pub fn ratio(query: &str, choice: &str, limit: u8) -> u8 {

	if query == choice {
		return 100;
	}

	// when the limit is 100 it can not match better
	if query.is_empty() || choice.is_empty() || limit == 100 {
		return 0;
	}

	//_token_set(query, choice);
	let len_a = query.len() as f32;
	let len_b = choice.len() as f32;
	let len_ratio = if len_a > len_b { 
		len_a / len_b 
	} else {  
		len_b / len_a
	};

	if len_ratio < 1.5 {
        full_ratio(query, choice, limit)
    } else if len_ratio < 8.0 {
        partial_ratio(query, choice, 0.9, limit)
    } else {
        partial_ratio(query, choice, 0.6, limit)
    }
}


fn partial_ratio(query: &str, choice: &str, partial_scale: f32, limit: u8) -> u8 {
	let mut sratio = normalized_weighted_levenshtein(query, choice);

	const UNBASE_SCALE: f32 = 0.95;
	let mut min_ratio = (limit as f32 / 100.0).max(sratio);
	if min_ratio < UNBASE_SCALE {
		sratio = sratio.max(partial_string_ratio(query, choice) * UNBASE_SCALE);

		min_ratio = min_ratio.max(sratio);
		if min_ratio < UNBASE_SCALE * partial_scale {
			sratio = sratio.max(partial_token_ratio(query, choice) * UNBASE_SCALE * partial_scale )
		}

	}	

	(sratio * 100.0).round() as u8
}


fn full_ratio(query: &str, choice: &str, limit: u8) -> u8{
	let mut sratio = normalized_weighted_levenshtein(query, choice);

	const UNBASE_SCALE: f32 = 0.95;
	let min_ratio = (limit as f32 / 100.0).max(sratio);
	if min_ratio < UNBASE_SCALE {
		sratio = sratio.max(token_ratio(query, choice) * UNBASE_SCALE);
	}

	(sratio * 100.0).round() as u8
}


fn partial_string_ratio(query: &str, choice: &str) -> f32 {
	if query.is_empty() || choice.is_empty() {
		return 0.0;
	}

	let shorter;
	let longer;
	
	if query.len() <= choice.len() {
		shorter = query;
		longer = choice;
	} else {
		longer = query;
		shorter = choice;
	}

	let edit_ops = editops::editops_find(shorter, longer);
	let blocks = editops::editops_matching_blocks(shorter.len(), longer.len(), &edit_ops);

	let mut scores: Vec<f32> = vec![];
	for block in blocks {
		let long_start = if block.second_start > block.first_start {
			block.second_start - block.first_start
		} else {
			0
		};

		let long_end = long_start + shorter.chars().count();
		let long_substr = &longer[long_start..long_end];

		let ls_ratio = normalized_weighted_levenshtein(shorter, long_substr);
	
		if ls_ratio > 0.995 {
			return 1.0;
		} else {
			scores.push(ls_ratio)
		}
			
	}

	scores.iter().fold(0.0f32, |max, &val| max.max(val))
}


fn intersection_count_sorted_vec(a: &Vec<&str>, b: &mut Vec<&str> ) -> (String, String, String) {
	let mut sorted_sect: Vec<&str> = vec![];
	let mut sorted_1to2: Vec<&str> = vec![];

	for current_a in a {
		match b.binary_search(&current_a) {
			Ok(index) => {
				b.remove(index);
				sorted_sect.push(current_a)
			},
			_ => sorted_1to2.push(current_a),
		}
	}
    (sorted_sect.join(" "), sorted_1to2.join(" "), b.join(" "))
}


fn token_ratio(query: &str, choice: &str) -> f32 {
	let mut query_tokens: Vec<_> = query.split_whitespace().collect();
	query_tokens.sort_unstable();
	let mut choice_tokens: Vec<_> = choice.split_whitespace().collect();
	choice_tokens.sort_unstable();
	
	let sorted_query = query_tokens.join(" ");
	let sorted_choice = choice_tokens.join(" ");

	query_tokens.dedup();
	choice_tokens.dedup();

	let (sorted_sect, sorted_1to2, sorted_2to1) = intersection_count_sorted_vec(&query_tokens, &mut choice_tokens);

	if sorted_1to2.is_empty() || sorted_2to1.is_empty() {
		return 1.0;
	}

	let double_prefix = 2 * sorted_sect.chars().count();
	let mut len_1to2 = sorted_1to2.chars().count();
	let mut len_2to1 = sorted_2to1.chars().count();
	if !sorted_sect.is_empty() {
		len_1to2 += 1;
		len_2to1 += 1;
	}

	normalized_weighted_levenshtein(&sorted_query, &sorted_choice)
		.max(1.0 - len_1to2 as f32 / (len_1to2 + double_prefix) as f32)
		.max(1.0 - len_2to1 as f32 / (len_2to1 + double_prefix) as f32)
		.max(1.0 - weighted_levenshtein(&sorted_2to1, &sorted_1to2) as f32 / (len_1to2 + len_2to1 + double_prefix) as f32)
}


fn partial_token_ratio(query: &str, choice: &str) -> f32 {
	let mut query_tokens: Vec<_> = query.split_whitespace().collect();
	query_tokens.sort_unstable();
	let mut choice_tokens: Vec<_> = choice.split_whitespace().collect();
	choice_tokens.sort_unstable();
	
	let sorted_query = query_tokens.join(" ");
	let sorted_choice = choice_tokens.join(" ");

	query_tokens.dedup();
	choice_tokens.dedup();

	let (sorted_sect, mut sorted_1to2, mut sorted_2to1) = intersection_count_sorted_vec(&query_tokens, &mut choice_tokens);

	// TODO can probably be optimised in a similar way to token_ratio
	if sorted_sect.is_empty() == false {
		if sorted_1to2.is_empty() == false {
			sorted_1to2.insert_str(0, " ");
		}
		if sorted_2to1.is_empty() == false {
			sorted_2to1.insert_str(0, " ");
		}
	}
	let combined_1to2 = String::from(&sorted_sect) + &sorted_1to2;
	let combined_2to1 = String::from(&sorted_sect) + &sorted_2to1;
	partial_string_ratio(&sorted_query, &sorted_choice)
		.max(partial_string_ratio(&sorted_sect, &combined_1to2))
		.max(partial_string_ratio(&sorted_sect, &combined_2to1))
		.max(partial_string_ratio(&combined_1to2, &combined_2to1))
}