use std::borrow::Cow;
use std::collections::HashMap;
use nohash_hasher::{IntMap, IntSet};
use sonic_rs::{to_string, Deserialize, Serialize};
use crate::regex_matcher::{RegexMatcher, RegexTable};
use crate::sim_matcher::{SimMatcher, SimTable};
use crate::simple_matcher::{SimpleMatchType, SimpleMatcher};
/// Common interface shared by the text matchers in this crate.
///
/// `T` is the result type a matcher yields and must implement
/// [`MatchResultTrait`].
pub trait TextMatcherTrait<'a, T: MatchResultTrait<'a>> {
    /// Returns `true` when the matcher considers `text` a match.
    fn is_match(&self, text: &str) -> bool;

    /// Returns the results the matcher produces for a single `text`.
    fn process(&'a self, text: &str) -> Vec<T>;

    /// Calls [`Self::process`] on every entry of `text_array`, in order, and
    /// returns one result list per input text.
    fn batch_process(&'a self, text_array: &[&str]) -> Vec<Vec<T>> {
        let mut per_text_results = Vec::with_capacity(text_array.len());
        for &text in text_array {
            per_text_results.push(self.process(text));
        }
        per_text_results
    }
}
/// Read-only view over a single match result.
///
/// Only [`word`](Self::word) has to be implemented; both id accessors fall
/// back to `0` unless an implementor overrides them.
pub trait MatchResultTrait<'a> {
    /// The word carried by this result.
    fn word(&self) -> &str;

    /// Id of the table this result belongs to; `0` by default.
    fn table_id(&self) -> u64 {
        0
    }

    /// Id of the word this result belongs to; `0` by default.
    fn word_id(&self) -> u64 {
        0
    }
}
/// Kind of matching requested for a [`MatchTable`]'s `word_list`.
///
/// Variant names are (de)serialized in snake_case, e.g. `similar_char`.
///
/// `Matcher::new` uses this value to decide which underlying matcher receives
/// the table's words.
#[derive(Serialize, Deserialize, Clone, Copy, Debug, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum MatchTableType {
/// Words are registered with the `SimpleMatcher`.
Simple,
/// Table is forwarded to the `RegexMatcher` as a `RegexTable`.
SimilarChar,
/// Table is forwarded to the `RegexMatcher` as a `RegexTable`.
Acrostic,
/// Table is forwarded to the `SimMatcher` as a `SimTable`.
SimilarTextLevenshtein,
/// Table is forwarded to the `RegexMatcher` as a `RegexTable`.
Regex,
}
/// One word table belonging to a match rule, as consumed by `Matcher::new`.
///
/// `word_list` and `exemption_word_list` hold borrowed `&'a str` slices;
/// `#[serde(borrow)]` asks the deserializer to borrow them from its input
/// instead of copying.
#[derive(Serialize, Deserialize, Clone)]
pub struct MatchTable<'a> {
/// Identifier of this table; recorded together with the match id when the
/// table is registered.
pub table_id: u64,
/// Selects which underlying matcher receives `word_list`.
pub match_table_type: MatchTableType,
/// `SimpleMatchType` under which `word_list` is registered with the simple
/// matcher. Only read when `match_table_type` is `Simple`.
pub simple_match_type: SimpleMatchType,
/// Words to match. An empty list registers nothing.
#[serde(borrow)]
pub word_list: Vec<&'a str>,
/// `SimpleMatchType` under which `exemption_word_list` is registered with
/// the simple matcher.
pub exemption_simple_match_type: SimpleMatchType,
/// Exemption words. They are always registered with the simple matcher,
/// whatever `match_table_type` is; a hit on one of them discards the
/// results of the owning match id in `Matcher::word_match_raw`.
#[serde(borrow)]
pub exemption_word_list: Vec<&'a str>,
}
/// Bookkeeping record for one group of words registered with the simple
/// matcher (either a table's `word_list` or its `exemption_word_list`).
#[derive(Debug, Clone)]
struct WordTableConf {
/// Match id the words belong to; results are grouped under this key.
match_id: u64,
/// Id of the table the words came from.
table_id: u64,
/// `true` when the words came from an `exemption_word_list`.
is_exemption: bool,
}
/// A single hit produced by [`Matcher`], serializable via `Serialize`.
#[derive(Serialize)]
pub struct MatchResult<'a> {
/// Id of the table the hit is attributed to.
pub table_id: u64,
/// The `word` value reported by the underlying matcher for this hit;
/// may be borrowed or owned.
pub word: Cow<'a, str>,
}
/// Exposes the fields of a [`MatchResult`] through [`MatchResultTrait`].
impl MatchResultTrait<'_> for MatchResult<'_> {
    /// Borrows the matched word as a plain string slice.
    fn word(&self) -> &str {
        &self.word
    }

    /// Returns the `table_id` stored in this result.
    fn table_id(&self) -> u64 {
        self.table_id
    }

    /// Always `0`: a `MatchResult` stores no word id.
    fn word_id(&self) -> u64 {
        0
    }
}
/// Integer-keyed map from a `u64` id to a list of [`MatchTable`]s.
///
/// NOTE(review): presumably keyed by match id, matching the
/// `(match_id, tables)` pairs that `Matcher::new` iterates — confirm against callers.
pub type MatchTableMap<'a> = IntMap<u64, Vec<MatchTable<'a>>>;
/// Front-end that combines the simple, regex and similarity matchers and
/// groups their hits by match id.
#[derive(Debug, Clone)]
pub struct Matcher {
/// Word-table conf id -> configuration of that word group.
simple_word_table_conf_map: IntMap<u64, WordTableConf>,
/// Simple-matcher word id -> word-table conf id.
simple_word_table_conf_id_map: IntMap<u64, u64>,
/// `None` when no word was registered for the simple matcher.
simple_matcher: Option<SimpleMatcher>,
/// `None` when no table was routed to the regex matcher.
regex_matcher: Option<RegexMatcher>,
/// `None` when no table was routed to the similarity matcher.
sim_matcher: Option<SimMatcher>,
}
impl Matcher {
pub fn new<'a, I, M>(match_table_map: I) -> Matcher
where
I: IntoIterator<Item = (u64, M)>,
M: IntoIterator<Item = MatchTable<'a>>,
{
let mut word_id: u64 = 0;
let mut word_table_conf_id: u64 = 0;
let mut simple_word_table_conf_map = IntMap::default();
let mut simple_word_table_conf_id_map = IntMap::default();
let mut simple_match_type_word_map: IntMap<SimpleMatchType, IntMap<u64, &'a str>> =
IntMap::default();
let mut regex_table_list: Vec<RegexTable> = Vec::new();
let mut sim_table_list: Vec<SimTable> = Vec::new();
for (match_id, table_list) in match_table_map.into_iter() {
for table in table_list.into_iter() {
let table_id = table.table_id;
let match_table_type = table.match_table_type;
let word_list = table.word_list;
let exemption_word_list = &table.exemption_word_list;
if !word_list.is_empty() {
match match_table_type {
MatchTableType::Simple => {
simple_word_table_conf_map.insert(
word_table_conf_id,
WordTableConf {
match_id,
table_id,
is_exemption: false,
},
);
let simple_word_map = simple_match_type_word_map
.entry(table.simple_match_type)
.or_default();
for word in word_list.iter() {
simple_word_table_conf_id_map.insert(word_id, word_table_conf_id);
simple_word_map.insert(word_id, word);
word_id += 1;
}
word_table_conf_id += 1
}
MatchTableType::SimilarTextLevenshtein => sim_table_list.push(SimTable {
table_id,
match_id,
word_list,
}),
_ => regex_table_list.push(RegexTable {
table_id,
match_id,
match_table_type,
word_list,
}),
}
}
if !exemption_word_list.is_empty() {
simple_word_table_conf_map.insert(
word_table_conf_id,
WordTableConf {
match_id,
table_id,
is_exemption: true,
},
);
let simple_word_map = simple_match_type_word_map
.entry(table.exemption_simple_match_type)
.or_default();
for exemption_word in exemption_word_list.iter() {
simple_word_table_conf_id_map.insert(word_id, word_table_conf_id);
simple_word_map.insert(word_id, exemption_word);
word_id += 1;
}
word_table_conf_id += 1
}
}
}
Matcher {
simple_word_table_conf_map,
simple_word_table_conf_id_map,
simple_matcher: (!simple_match_type_word_map.is_empty())
.then(|| SimpleMatcher::new(simple_match_type_word_map)),
regex_matcher: (!regex_table_list.is_empty())
.then(|| RegexMatcher::new(®ex_table_list)),
sim_matcher: (!sim_table_list.is_empty()).then(|| SimMatcher::new(&sim_table_list)),
}
}
pub fn word_match_raw(&self, text: &str) -> HashMap<u64, Vec<MatchResult>> {
if !text.is_empty() {
let mut match_result_dict = HashMap::default();
let mut failed_match_id_set = IntSet::default();
if let Some(regex_matcher) = &self.regex_matcher {
for regex_result in regex_matcher.process(text) {
let result_list = match_result_dict
.entry(regex_result.match_id)
.or_insert(Vec::new());
result_list.push(MatchResult {
table_id: regex_result.table_id,
word: regex_result.word,
})
}
}
if let Some(sim_matcher) = &self.sim_matcher {
for sim_result in sim_matcher.process(text) {
let result_list = match_result_dict
.entry(sim_result.match_id)
.or_insert(Vec::new());
result_list.push(MatchResult {
table_id: sim_result.table_id,
word: sim_result.word,
})
}
}
if let Some(simple_matcher) = &self.simple_matcher {
for simple_result in simple_matcher.process(text) {
let word_table_conf = unsafe {
self.simple_word_table_conf_map
.get(
self.simple_word_table_conf_id_map
.get(&simple_result.word_id)
.unwrap_unchecked(),
)
.unwrap_unchecked()
};
if word_table_conf.is_exemption {
failed_match_id_set.insert(word_table_conf.match_id);
match_result_dict.remove(&word_table_conf.match_id);
}
if failed_match_id_set.contains(&word_table_conf.match_id) {
continue;
}
let result_list = match_result_dict
.entry(word_table_conf.match_id)
.or_insert(Vec::new());
result_list.push(MatchResult {
table_id: word_table_conf.table_id,
word: simple_result.word,
})
}
}
match_result_dict
} else {
HashMap::default()
}
}
pub fn word_match(&self, text: &str) -> HashMap<u64, String> {
self.word_match_raw(text)
.into_iter()
.map(|(match_id, result_list)| {
(match_id, unsafe {
to_string(&result_list).unwrap_unchecked()
})
})
.collect()
}
pub fn word_match_as_string(&self, text: &str) -> String {
unsafe { to_string(&self.word_match(text)).unwrap_unchecked() }
}
}
/// [`TextMatcherTrait`] implementation backed by [`Matcher::word_match_raw`].
impl<'a> TextMatcherTrait<'a, MatchResult<'a>> for Matcher {
    /// Returns `true` when `word_match_raw` yields at least one match id for `text`.
    fn is_match(&self, text: &str) -> bool {
        let grouped = self.word_match_raw(text);
        !grouped.is_empty()
    }

    /// Returns every result for `text` as one flat list, dropping the
    /// match-id grouping.
    fn process(&'a self, text: &str) -> Vec<MatchResult<'a>> {
        self.word_match_raw(text)
            .into_values()
            .flatten()
            .collect()
    }
}