use std::borrow::Cow;
use fancy_regex::Regex;
use rapidfuzz::distance;
use sonic_rs::{Deserialize, Serialize};
use crate::matcher::{MatchResultTrait, TextMatcherTrait};
#[cfg(feature = "serde")]
use crate::util::serde::serde_regex;
/// Similarity algorithm used to score a table's words against the input text.
///
/// Each variant selects the `rapidfuzz::distance` module of the same name; the
/// matching code in this file calls that module's
/// `normalized_similarity_with_args`. Because of `rename_all = "snake_case"`,
/// the variants are (de)serialized as `levenshtein`, `damerau_levenshtein`,
/// `indel`, `jaro` and `jaro_winkler`.
#[derive(Serialize, Deserialize, Clone, Copy, Debug, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum SimMatchType {
/// Scores with `rapidfuzz::distance::levenshtein`.
Levenshtein,
/// Scores with `rapidfuzz::distance::damerau_levenshtein`.
DamerauLevenshtein,
/// Scores with `rapidfuzz::distance::indel`.
Indel,
/// Scores with `rapidfuzz::distance::jaro`.
Jaro,
/// Scores with `rapidfuzz::distance::jaro_winkler`.
JaroWinkler,
}
/// Borrowed description of one similarity table, consumed by `SimMatcher::new`.
///
/// `SimMatcher::new` copies every field into an owned, private table, so the
/// borrowed word list only has to outlive the constructor call.
#[derive(Debug, Clone)]
pub struct SimTable<'a> {
/// Table identifier, copied into every `SimResult` produced from this table.
pub table_id: u32,
/// Match identifier, copied into every `SimResult` produced from this table.
pub match_id: u32,
/// Algorithm used to score this table's words against the input text.
pub sim_match_type: SimMatchType,
/// Candidate words; `SimMatcher::new` clones each one into an owned `String`.
pub word_list: &'a Vec<&'a str>,
/// Value passed to rapidfuzz as the `score_cutoff` of the similarity call.
/// NOTE(review): presumably expected in `0.0..=1.0`, since it bounds a
/// normalized similarity — confirm against callers.
pub threshold: f64,
}
/// Owned counterpart of `SimTable`, stored inside `SimMatcher`.
///
/// Built field by field in `SimMatcher::new`; the only difference from
/// `SimTable` is that the words are owned `String`s. Serde derives are enabled
/// only when the `serde` feature is on.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
struct SimProcessedTable {
// Copied from `SimTable::table_id`; reported in each `SimResult`.
table_id: u32,
// Copied from `SimTable::match_id`; reported in each `SimResult`.
match_id: u32,
// Selects which rapidfuzz distance module scores this table's words.
sim_match_type: SimMatchType,
// Owned copies of the words supplied through `SimTable::word_list`.
word_list: Vec<String>,
// Passed to rapidfuzz as `score_cutoff` for every comparison in this table.
threshold: f64,
}
/// One table word that rapidfuzz accepted under the table's score cutoff.
#[derive(Debug, Clone)]
pub struct SimResult<'a> {
/// `match_id` of the table the word belongs to.
pub match_id: u32,
/// `table_id` of the table the word belongs to.
pub table_id: u32,
/// The table word that matched; `SimMatcher::process` fills this with a
/// `Cow::Borrowed` pointing into the matcher's own word list.
pub word: Cow<'a, str>,
/// Normalized similarity returned by rapidfuzz for `word` versus the
/// input text after special characters were stripped.
pub similarity: f64,
}
impl MatchResultTrait<'_> for SimResult<'_> {
    /// Returns the identifier of the table whose word produced this result.
    fn table_id(&self) -> u32 {
        self.table_id
    }

    /// Returns the matched table word as a string slice.
    fn word(&self) -> &str {
        // `&Cow<str>` deref-coerces to `&str`, whether borrowed or owned.
        &self.word
    }
}
/// Fuzzy matcher that scores input text against the words of several tables.
///
/// Construct it with `SimMatcher::new`. Serde derives are enabled only when
/// the `serde` feature is on.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct SimMatcher {
// Pattern whose matches are replaced with "" in the input text before
// scoring; `new` compiles it from `\W+`. With the `serde` feature it is
// (de)serialized through the crate's `serde_regex` helper.
#[cfg_attr(feature = "serde", serde(with = "serde_regex"))]
remove_special_pattern: Regex,
// Owned copies of the tables passed to `new`, in the same order.
sim_processed_table_list: Vec<SimProcessedTable>,
}
impl SimMatcher {
    /// Builds a matcher from borrowed table descriptions.
    ///
    /// Every [`SimTable`] is copied into an owned internal table (each word is
    /// cloned into a `String`), so the matcher does not borrow from
    /// `sim_table_list` after this call returns. Table order is preserved.
    ///
    /// # Panics
    ///
    /// Panics only if the fixed pattern `\W+` fails to compile.
    pub fn new(sim_table_list: &[SimTable]) -> SimMatcher {
        let remove_special_pattern = Regex::new(r"\W+").unwrap();

        let mut sim_processed_table_list = Vec::with_capacity(sim_table_list.len());
        for table in sim_table_list {
            let mut owned_words: Vec<String> = Vec::with_capacity(table.word_list.len());
            for word in table.word_list.iter() {
                owned_words.push(String::from(*word));
            }

            sim_processed_table_list.push(SimProcessedTable {
                table_id: table.table_id,
                match_id: table.match_id,
                sim_match_type: table.sim_match_type,
                word_list: owned_words,
                threshold: table.threshold,
            });
        }

        SimMatcher {
            remove_special_pattern,
            sim_processed_table_list,
        }
    }
}
impl<'a> TextMatcherTrait<'a, SimResult<'a>> for SimMatcher {
fn is_match(&self, text: &str) -> bool {
let processed_text = self.remove_special_pattern.replace_all(text, "");
self.sim_processed_table_list
.iter()
.any(|sim_table| match sim_table.sim_match_type {
SimMatchType::Levenshtein => sim_table.word_list.iter().any(|text| {
distance::levenshtein::normalized_similarity_with_args(
text.chars(),
processed_text.chars(),
&distance::levenshtein::Args::default().score_cutoff(sim_table.threshold),
)
.is_some()
}),
SimMatchType::DamerauLevenshtein => sim_table.word_list.iter().any(|text| {
distance::damerau_levenshtein::normalized_similarity_with_args(
text.chars(),
processed_text.chars(),
&distance::damerau_levenshtein::Args::default()
.score_cutoff(sim_table.threshold),
)
.is_some()
}),
SimMatchType::Indel => sim_table.word_list.iter().any(|text| {
distance::indel::normalized_similarity_with_args(
text.chars(),
processed_text.chars(),
&distance::indel::Args::default().score_cutoff(sim_table.threshold),
)
.is_some()
}),
SimMatchType::Jaro => sim_table.word_list.iter().any(|text| {
distance::jaro::normalized_similarity_with_args(
text.chars(),
processed_text.chars(),
&distance::jaro::Args::default().score_cutoff(sim_table.threshold),
)
.is_some()
}),
SimMatchType::JaroWinkler => sim_table.word_list.iter().any(|text| {
distance::jaro_winkler::normalized_similarity_with_args(
text.chars(),
processed_text.chars(),
&distance::jaro_winkler::Args::default().score_cutoff(sim_table.threshold),
)
.is_some()
}),
})
}
fn process(&'a self, text: &str) -> Vec<SimResult<'a>> {
let processed_text = self.remove_special_pattern.replace_all(text, "");
let mut result_list = Vec::new();
for sim_table in &self.sim_processed_table_list {
match sim_table.sim_match_type {
SimMatchType::Levenshtein => {
result_list.extend(sim_table.word_list.iter().filter_map(|text| {
distance::levenshtein::normalized_similarity_with_args(
text.chars(),
processed_text.chars(),
&distance::levenshtein::Args::default()
.score_cutoff(sim_table.threshold),
)
.map(|similarity| SimResult {
match_id: sim_table.match_id,
table_id: sim_table.table_id,
word: Cow::Borrowed(text),
similarity,
})
}))
}
SimMatchType::DamerauLevenshtein => {
result_list.extend(sim_table.word_list.iter().filter_map(|text| {
distance::damerau_levenshtein::normalized_similarity_with_args(
text.chars(),
processed_text.chars(),
&distance::damerau_levenshtein::Args::default()
.score_cutoff(sim_table.threshold),
)
.map(|similarity| SimResult {
match_id: sim_table.match_id,
table_id: sim_table.table_id,
word: Cow::Borrowed(text),
similarity,
})
}))
}
SimMatchType::Indel => {
result_list.extend(sim_table.word_list.iter().filter_map(|text| {
distance::indel::normalized_similarity_with_args(
text.chars(),
processed_text.chars(),
&distance::indel::Args::default().score_cutoff(sim_table.threshold),
)
.map(|similarity| SimResult {
match_id: sim_table.match_id,
table_id: sim_table.table_id,
word: Cow::Borrowed(text),
similarity,
})
}))
}
SimMatchType::Jaro => {
result_list.extend(sim_table.word_list.iter().filter_map(|text| {
distance::jaro::normalized_similarity_with_args(
text.chars(),
processed_text.chars(),
&distance::jaro::Args::default().score_cutoff(sim_table.threshold),
)
.map(|similarity| SimResult {
match_id: sim_table.match_id,
table_id: sim_table.table_id,
word: Cow::Borrowed(text),
similarity,
})
}))
}
SimMatchType::JaroWinkler => {
result_list.extend(sim_table.word_list.iter().filter_map(|text| {
distance::jaro_winkler::normalized_similarity_with_args(
text.chars(),
processed_text.chars(),
&distance::jaro_winkler::Args::default()
.score_cutoff(sim_table.threshold),
)
.map(|similarity| SimResult {
match_id: sim_table.match_id,
table_id: sim_table.table_id,
word: Cow::Borrowed(text),
similarity,
})
}))
}
}
}
result_list
}
}