use std::borrow::Cow;
use std::collections::{HashMap, HashSet};
use serde::{Deserialize, Serialize};
use crate::process::process_matcher::{
ProcessType, ProcessTypeBitNode, ProcessedTextSet, build_process_type_tree,
reduce_text_process_with_tree,
};
use crate::regex_matcher::{RegexMatchType, RegexMatcher, RegexResult, RegexTable};
use crate::sim_matcher::{SimMatchType, SimMatcher, SimResult, SimTable};
use crate::simple_matcher::{SimpleMatcher, SimpleTable};
/// Public text-matching interface shared by every matcher in the crate.
///
/// Implementors only have to provide [`TextMatcherTrait::process_iter`]; the
/// other two methods are derived from it by default and may be overridden
/// with something cheaper.
#[diagnostic::on_unimplemented(
    message = "`{Self}` does not implement text matching",
    label = "this type cannot be used as a matcher",
    note = "implement `TextMatcherTrait` or use one of the built-in matchers: `SimpleMatcher`, `RegexMatcher`, `SimMatcher`, or `Matcher`"
)]
pub trait TextMatcherTrait<'a, T: MatchResultTrait<'a> + 'a> {
    /// Returns `true` when `text` yields at least one result.
    ///
    /// The default pulls a single item from `process_iter` and stops.
    fn is_match(&'a self, text: &'a str) -> bool {
        let mut results = self.process_iter(text);
        results.next().is_some()
    }

    /// Collects every result produced for `text` into a `Vec`.
    fn process(&'a self, text: &'a str) -> Vec<T> {
        let mut collected = Vec::new();
        collected.extend(self.process_iter(text));
        collected
    }

    /// Produces the results for `text` as an iterator.
    fn process_iter(&'a self, text: &'a str) -> impl Iterator<Item = T> + 'a;
}
/// Crate-internal matching interface that works on already pre-processed text.
///
/// Both methods receive a [`ProcessedTextSet`] instead of raw text, which lets a
/// caller build the set once and hand the same set to several matchers.
pub(crate) trait TextMatcherInternal<'a, T: MatchResultTrait<'a> + 'a> {
/// Reports whether the pre-processed text set matches.
fn is_match_preprocessed(
&'a self,
processed_text_process_type_set: &ProcessedTextSet<'a>,
) -> bool;
/// Returns the results produced for the pre-processed text set.
fn process_preprocessed(
&'a self,
processed_text_process_type_set: &ProcessedTextSet<'a>,
) -> Vec<T>;
}
/// Read-only accessors common to every kind of match result.
#[diagnostic::on_unimplemented(
message = "`{Self}` does not implement `MatchResultTrait`",
label = "this type cannot be used as a match result",
note = "implement `MatchResultTrait` with `match_id`, `table_id`, `word_id`, `word`, and `similarity` methods"
)]
pub trait MatchResultTrait<'a> {
/// Identifier of the match the result belongs to.
fn match_id(&self) -> u32;
/// Identifier of the table the result belongs to.
fn table_id(&self) -> u32;
/// Identifier of the word within its table.
fn word_id(&self) -> u32;
/// The word carried by the result.
fn word(&self) -> &str;
/// Similarity score, or `None` when the result carries no score.
fn similarity(&self) -> Option<f64>;
}
/// Selects which matcher a table is routed to, together with the parameters
/// that matcher needs.
///
/// Variant names are (de)serialized in `snake_case`.
#[derive(Serialize, Deserialize, Clone, Copy, Debug, PartialEq)]
#[serde(rename_all = "snake_case")]
pub enum MatchTableType {
/// The table's words are registered with the simple matcher.
Simple {
/// Text-processing pipeline under which the words are registered.
process_type: ProcessType,
},
/// The table is forwarded to the regex matcher.
Regex {
/// Regex flavour, passed through unchanged to the `RegexTable`.
regex_match_type: RegexMatchType,
/// Text-processing pipeline associated with the table.
process_type: ProcessType,
},
/// The table is forwarded to the similarity matcher.
Similar {
/// Similarity algorithm, passed through unchanged to the `SimTable`.
sim_match_type: SimMatchType,
/// Threshold passed through unchanged to the `SimTable`.
threshold: f64,
/// Text-processing pipeline associated with the table.
process_type: ProcessType,
},
}
/// Abstraction over table definitions so that `Matcher::new` can be built from
/// any table type whose words can be viewed as `&str`.
pub trait MatchTableTrait<S: AsRef<str>> {
/// Identifier of this table.
fn table_id(&self) -> u32;
/// How the words of this table are matched.
fn match_table_type(&self) -> MatchTableType;
/// Words to match.
fn word_list(&self) -> &[S];
/// Process type under which the exemption words are registered.
fn exemption_process_type(&self) -> ProcessType;
/// Words that, when matched, suppress the results of this table.
fn exemption_word_list(&self) -> &[S];
}
/// Table definition whose words are borrowed string slices.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct MatchTable<'a> {
/// Identifier of this table.
pub table_id: u32,
/// How the words of this table are matched.
pub match_table_type: MatchTableType,
/// Words to match.
#[serde(borrow)]
pub word_list: Vec<&'a str>,
/// Process type under which the exemption words are registered.
pub exemption_process_type: ProcessType,
/// Words that, when matched, suppress the results of this table.
#[serde(borrow)]
pub exemption_word_list: Vec<&'a str>,
}
/// Exposes the public fields of [`MatchTable`] through [`MatchTableTrait`].
impl<'a> MatchTableTrait<&'a str> for MatchTable<'a> {
    /// Identifier of this table.
    fn table_id(&self) -> u32 {
        self.table_id
    }

    /// How the words of this table are matched.
    fn match_table_type(&self) -> MatchTableType {
        self.match_table_type
    }

    /// Process type under which the exemption words are registered.
    fn exemption_process_type(&self) -> ProcessType {
        self.exemption_process_type
    }

    /// Words to match, as a slice over the stored vector.
    fn word_list(&self) -> &[&'a str] {
        self.word_list.as_slice()
    }

    /// Exemption words, as a slice over the stored vector.
    fn exemption_word_list(&self) -> &[&'a str] {
        self.exemption_word_list.as_slice()
    }
}
/// Table definition whose words are `Cow<str>`, so each word may be either
/// borrowed or owned.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct MatchTableSerde<'a> {
/// Identifier of this table.
pub table_id: u32,
/// How the words of this table are matched.
pub match_table_type: MatchTableType,
/// Words to match.
#[serde(borrow)]
pub word_list: Vec<Cow<'a, str>>,
/// Process type under which the exemption words are registered.
pub exemption_process_type: ProcessType,
/// Words that, when matched, suppress the results of this table.
#[serde(borrow)]
pub exemption_word_list: Vec<Cow<'a, str>>,
}
/// Exposes the public fields of [`MatchTableSerde`] through [`MatchTableTrait`].
impl<'a> MatchTableTrait<Cow<'a, str>> for MatchTableSerde<'a> {
    /// Identifier of this table.
    fn table_id(&self) -> u32 {
        self.table_id
    }

    /// How the words of this table are matched.
    fn match_table_type(&self) -> MatchTableType {
        self.match_table_type
    }

    /// Process type under which the exemption words are registered.
    fn exemption_process_type(&self) -> ProcessType {
        self.exemption_process_type
    }

    /// Words to match, as a slice over the stored vector.
    fn word_list(&self) -> &[Cow<'a, str>] {
        self.word_list.as_slice()
    }

    /// Exemption words, as a slice over the stored vector.
    fn exemption_word_list(&self) -> &[Cow<'a, str>] {
        self.exemption_word_list.as_slice()
    }
}
/// Bookkeeping for one group of words registered with the simple matcher:
/// either the word list of a `Simple` table or the exemption list of any table.
#[derive(Debug, Clone)]
struct WordTableConf {
/// Match the group belongs to.
match_id: u32,
/// Table the group belongs to.
table_id: u32,
/// Global simple-word id assigned to the first word of the group; subtracted
/// from a hit's global word id to recover the word's position in its list.
offset: u32,
/// `true` when the group is an exemption list rather than a word list.
is_exemption: bool,
}
/// A single result reported by [`Matcher`].
#[derive(Serialize, Debug)]
pub struct MatchResult<'a> {
/// Match the result belongs to.
pub match_id: u32,
/// Table the result belongs to.
pub table_id: u32,
/// Identifier of the word within its table.
pub word_id: u32,
/// The word carried by the result.
pub word: Cow<'a, str>,
/// Similarity score; `Some` for results converted from `SimResult`, `None`
/// for results coming from the simple or regex matchers.
pub similarity: Option<f64>,
}
/// Accessors that hand out the public fields of [`MatchResult`].
impl MatchResultTrait<'_> for MatchResult<'_> {
    /// Match the result belongs to.
    fn match_id(&self) -> u32 {
        self.match_id
    }

    /// Table the result belongs to.
    fn table_id(&self) -> u32 {
        self.table_id
    }

    /// Identifier of the word within its table.
    fn word_id(&self) -> u32 {
        self.word_id
    }

    /// Borrows the word as a plain string slice.
    fn word(&self) -> &str {
        &self.word
    }

    /// Similarity score, if the result carries one.
    fn similarity(&self) -> Option<f64> {
        self.similarity
    }
}
impl<'a, 'b: 'a> From<SimResult<'b>> for MatchResult<'a> {
fn from(sim_result: SimResult<'b>) -> Self {
MatchResult {
match_id: sim_result.match_id,
table_id: sim_result.table_id,
word_id: sim_result.word_id,
word: sim_result.word,
similarity: Some(sim_result.similarity),
}
}
}
impl<'a, 'b: 'a> From<RegexResult<'b>> for MatchResult<'a> {
fn from(regex_result: RegexResult<'b>) -> Self {
MatchResult {
match_id: regex_result.match_id,
table_id: regex_result.table_id,
word_id: regex_result.word_id,
word: regex_result.word,
similarity: None,
}
}
}
/// Map from `match_id` to the borrowed-word tables that belong to that match.
pub type MatchTableMap<'a> = HashMap<u32, Vec<MatchTable<'a>>>;
/// Map from `match_id` to the `Cow`-word tables that belong to that match.
pub type MatchTableMapSerde<'a> = HashMap<u32, Vec<MatchTableSerde<'a>>>;
/// Combined matcher that fans text out to the simple, regex and similarity
/// matchers and merges their results per `match_id`.
#[derive(Debug, Clone)]
pub struct Matcher {
/// Tree built from the set of every process type used by the registered
/// tables; used to pre-process input text before matching.
process_type_tree: Box<[ProcessTypeBitNode]>,
/// One entry per word group registered with the simple matcher.
simple_word_table_conf_list: Box<[WordTableConf]>,
/// Indexed by global simple-word id; yields the index of the word's group in
/// `simple_word_table_conf_list`.
simple_word_table_conf_index_list: Box<[usize]>,
/// Present only when at least one word or exemption word was registered.
simple_matcher: Option<SimpleMatcher>,
/// Present only when at least one non-empty regex table was registered.
regex_matcher: Option<RegexMatcher>,
/// Present only when at least one non-empty similarity table was registered.
sim_matcher: Option<SimMatcher>,
}
impl Matcher {
    /// Builds a [`Matcher`] from `match_table_map`, which maps every `match_id`
    /// to the tables that belong to it.
    ///
    /// For each table:
    /// * a non-empty word list is registered according to the table's
    ///   [`MatchTableType`]: `Simple` words go into the shared simple table
    ///   (grouped by process type), while `Regex` and `Similar` tables are
    ///   forwarded as [`RegexTable`] / [`SimTable`];
    /// * a non-empty exemption list is always registered in the simple table
    ///   under the table's exemption process type and flagged as an exemption.
    ///
    /// A sub-matcher is only constructed when something was registered for it.
    pub fn new<S, M, T>(match_table_map: &HashMap<u32, Vec<M>, S>) -> Matcher
    where
        M: MatchTableTrait<T>,
        T: AsRef<str>,
    {
        let mut process_type_set = HashSet::new();

        // Every word handed to the simple matcher gets a globally unique id.
        // `simple_word_table_conf_index_list[id]` records which group (word
        // list or exemption list of a table) that id belongs to.
        let mut simple_word_id = 0;
        let mut simple_word_table_conf_id = 0;
        let mut simple_word_table_conf_list = Vec::new();
        let mut simple_word_table_conf_index_list = Vec::new();

        let mut simple_table: SimpleTable = HashMap::new();
        let mut regex_table_list = Vec::new();
        let mut sim_table_list = Vec::new();

        for (&match_id, table_list) in match_table_map {
            for table in table_list {
                let table_id = table.table_id();
                let match_table_type = table.match_table_type();
                let word_list = table
                    .word_list()
                    .iter()
                    .map(|s| s.as_ref())
                    .collect::<Vec<&str>>();
                let exemption_process_type = table.exemption_process_type();
                let exemption_word_list = table
                    .exemption_word_list()
                    .iter()
                    .map(|s| s.as_ref())
                    .collect::<Vec<&str>>();

                if !word_list.is_empty() {
                    match match_table_type {
                        MatchTableType::Simple { process_type } => {
                            process_type_set.insert(process_type.bits());
                            simple_word_table_conf_list.push(WordTableConf {
                                match_id,
                                table_id,
                                offset: simple_word_id,
                                is_exemption: false,
                            });
                            let simple_word_map = simple_table.entry(process_type).or_default();
                            for word in word_list {
                                simple_word_table_conf_index_list.push(simple_word_table_conf_id);
                                simple_word_map.insert(simple_word_id, word);
                                simple_word_id += 1;
                            }
                            simple_word_table_conf_id += 1;
                        }
                        MatchTableType::Similar {
                            process_type,
                            sim_match_type,
                            threshold,
                        } => {
                            process_type_set.insert(process_type.bits());
                            sim_table_list.push(SimTable {
                                table_id,
                                match_id,
                                process_type,
                                sim_match_type,
                                word_list,
                                threshold,
                            });
                        }
                        MatchTableType::Regex {
                            process_type,
                            regex_match_type,
                        } => {
                            process_type_set.insert(process_type.bits());
                            regex_table_list.push(RegexTable {
                                table_id,
                                match_id,
                                process_type,
                                regex_match_type,
                                word_list,
                            });
                        }
                    }
                }

                // Exemption words are matched by the simple matcher regardless
                // of the table's own match type.
                if !exemption_word_list.is_empty() {
                    process_type_set.insert(exemption_process_type.bits());
                    simple_word_table_conf_list.push(WordTableConf {
                        match_id,
                        table_id,
                        offset: simple_word_id,
                        is_exemption: true,
                    });
                    let simple_word_map = simple_table.entry(exemption_process_type).or_default();
                    for exemption_word in exemption_word_list {
                        simple_word_table_conf_index_list.push(simple_word_table_conf_id);
                        simple_word_map.insert(simple_word_id, exemption_word);
                        simple_word_id += 1;
                    }
                    simple_word_table_conf_id += 1;
                }
            }
        }

        let process_type_tree = build_process_type_tree(&process_type_set).into_boxed_slice();

        Matcher {
            process_type_tree,
            simple_word_table_conf_list: simple_word_table_conf_list.into_boxed_slice(),
            simple_word_table_conf_index_list: simple_word_table_conf_index_list.into_boxed_slice(),
            simple_matcher: (!simple_table.is_empty()).then(|| SimpleMatcher::new(&simple_table)),
            regex_matcher: (!regex_table_list.is_empty())
                .then(|| RegexMatcher::new(&regex_table_list)),
            sim_matcher: (!sim_table_list.is_empty()).then(|| SimMatcher::new(&sim_table_list)),
        }
    }

    /// Matches `text` against every registered table and returns the results
    /// grouped by `match_id`.
    ///
    /// Empty text yields an empty map without doing any processing. Matches
    /// whose result list ends up empty are not present in the map.
    pub fn word_match<'a>(&'a self, text: &'a str) -> HashMap<u32, Vec<MatchResult<'a>>> {
        if text.is_empty() {
            return HashMap::new();
        }
        let processed_text_process_type_set =
            reduce_text_process_with_tree(&self.process_type_tree, text);
        self._word_match_with_processed_text_process_type_set(&processed_text_process_type_set)
    }

    /// Runs the regex, similarity and simple matchers (in that order) over an
    /// already pre-processed text set and merges their results per `match_id`.
    ///
    /// When an exemption word of a table is hit, every result collected so far
    /// for that `(match_id, table_id)` is dropped and later simple-matcher hits
    /// for the same pair are ignored. Matches left without results are removed
    /// from the returned map.
    fn _word_match_with_processed_text_process_type_set<'a>(
        &'a self,
        processed_text_process_type_set: &ProcessedTextSet<'a>,
    ) -> HashMap<u32, Vec<MatchResult<'a>>> {
        let mut match_result_dict: HashMap<u32, Vec<MatchResult<'a>>> = HashMap::new();
        // Keyed by `(match_id, table_id)`. A tuple is used instead of packing
        // both ids into one `usize`, because `<< 32` on `usize` overflows on
        // 32-bit targets.
        let mut failed_match_table_id_set: HashSet<(u32, u32)> = HashSet::new();

        if let Some(regex_matcher) = &self.regex_matcher {
            for regex_result in regex_matcher.process_preprocessed(processed_text_process_type_set)
            {
                let result_list = match_result_dict.entry(regex_result.match_id).or_default();
                result_list.push(regex_result.into());
            }
        }

        if let Some(sim_matcher) = &self.sim_matcher {
            for sim_result in sim_matcher.process_preprocessed(processed_text_process_type_set) {
                let result_list = match_result_dict.entry(sim_result.match_id).or_default();
                result_list.push(sim_result.into());
            }
        }

        if let Some(simple_matcher) = &self.simple_matcher {
            for simple_result in
                simple_matcher.process_preprocessed(processed_text_process_type_set)
            {
                // Both lookups are covered by the same invariant established in
                // `Matcher::new`, so both are checked and reported together.
                let word_table_conf = self
                    .simple_word_table_conf_index_list
                    .get(simple_result.word_id as usize)
                    .and_then(|&conf_index| self.simple_word_table_conf_list.get(conf_index))
                    .expect("`simple_word_table_conf_index_list` is pre-populated guaranteeing index mapping corresponds directly to valid indices mapped within `simple_word_table_conf_list`.");

                let match_table_id = (word_table_conf.match_id, word_table_conf.table_id);
                if failed_match_table_id_set.contains(&match_table_id) {
                    continue;
                }

                let result_list = match_result_dict
                    .entry(word_table_conf.match_id)
                    .or_default();
                if word_table_conf.is_exemption {
                    failed_match_table_id_set.insert(match_table_id);
                    result_list
                        .retain(|match_result| match_result.table_id != word_table_conf.table_id);
                } else {
                    result_list.push(MatchResult {
                        match_id: word_table_conf.match_id,
                        table_id: word_table_conf.table_id,
                        // Convert the global simple-word id back to the word's
                        // position inside its own list.
                        word_id: simple_result.word_id - word_table_conf.offset,
                        word: simple_result.word,
                        similarity: None,
                    });
                }
            }
        }

        // `entry(..).or_default()` and exemptions can leave empty lists behind.
        match_result_dict.retain(|_, match_result_list| !match_result_list.is_empty());
        match_result_dict
    }
}
/// [`TextMatcherTrait`] implementation for [`Matcher`].
///
/// All three methods treat empty text as "no match", so the overridden
/// `is_match` / `process` agree with `process_iter` (and with
/// `Matcher::word_match`).
impl<'a> TextMatcherTrait<'a, MatchResult<'a>> for Matcher {
    /// Returns `true` when `text` produces at least one result.
    ///
    /// Empty text never matches.
    fn is_match(&self, text: &str) -> bool {
        if text.is_empty() {
            return false;
        }
        let processed_text_process_type_set =
            reduce_text_process_with_tree(&self.process_type_tree, text);
        self.is_match_preprocessed(&processed_text_process_type_set)
    }

    /// Returns every result for `text` as a flat list.
    ///
    /// Empty text yields an empty list.
    fn process(&'a self, text: &'a str) -> Vec<MatchResult<'a>> {
        if text.is_empty() {
            return Vec::new();
        }
        let processed_text_process_type_set =
            reduce_text_process_with_tree(&self.process_type_tree, text);
        self.process_preprocessed(&processed_text_process_type_set)
    }

    /// Yields every result for `text`, one at a time.
    ///
    /// Empty text yields nothing. The full result map is computed on first
    /// use and then drained match by match.
    fn process_iter(&'a self, text: &'a str) -> impl Iterator<Item = MatchResult<'a>> + 'a {
        gen move {
            if text.is_empty() {
                return;
            }
            let processed_text_process_type_set =
                reduce_text_process_with_tree(&self.process_type_tree, text);
            let matches = self
                ._word_match_with_processed_text_process_type_set(&processed_text_process_type_set);
            for value_list in matches.into_values() {
                for match_result in value_list {
                    yield match_result;
                }
            }
        }
    }
}
/// [`TextMatcherInternal`] implementation for [`Matcher`], working on text that
/// has already been pre-processed.
impl<'a> TextMatcherInternal<'a, MatchResult<'a>> for Matcher {
    /// Reports whether the pre-processed text produces at least one result.
    ///
    /// When a simple matcher exists the full word match is run, since that is
    /// where exemption words are evaluated. Otherwise the regex matcher is
    /// asked first and the similarity matcher second.
    fn is_match_preprocessed(
        &'a self,
        processed_text_process_type_set: &ProcessedTextSet<'a>,
    ) -> bool {
        if self.simple_matcher.is_some() {
            let grouped = self
                ._word_match_with_processed_text_process_type_set(processed_text_process_type_set);
            return !grouped.is_empty();
        }

        let regex_hit = match &self.regex_matcher {
            Some(regex_matcher) => {
                regex_matcher.is_match_preprocessed(processed_text_process_type_set)
            }
            None => false,
        };
        if regex_hit {
            return true;
        }

        match &self.sim_matcher {
            Some(sim_matcher) => sim_matcher.is_match_preprocessed(processed_text_process_type_set),
            None => false,
        }
    }

    /// Returns every result for the pre-processed text as a flat list, in the
    /// iteration order of the per-match result map.
    fn process_preprocessed(
        &'a self,
        processed_text_process_type_set: &ProcessedTextSet<'a>,
    ) -> Vec<MatchResult<'a>> {
        let grouped =
            self._word_match_with_processed_text_process_type_set(processed_text_process_type_set);
        let mut flattened = Vec::new();
        for match_result_list in grouped.into_values() {
            flattened.extend(match_result_list);
        }
        flattened
    }
}