use crate::recognizers::Recognizer;
use crate::types::{Detection, DetectionExplanation, EntityType, NlpArtifacts};
use aho_corasick::{AhoCorasick, AhoCorasickBuilder, MatchKind};
/// Recognizer that flags every occurrence of a fixed list of dictionary terms.
///
/// All terms are compiled into a single Aho-Corasick automaton when the
/// recognizer is constructed; analysis then reports one detection per match.
#[derive(Clone)]
pub struct DictionaryRecognizer {
    /// Recognizer name; returned by `name()` and copied into each detection.
    name: String,
    /// The single entity type attached to every detection.
    entity: EntityType,
    /// Automaton compiled from the dictionary terms.
    matcher: AhoCorasick,
    /// Score assigned to every detection, regardless of which term matched.
    score: f32,
    /// Label copied into each detection's dictionary explanation.
    source: String,
}
impl DictionaryRecognizer {
    /// Builds a recognizer that matches any of `terms` in the analyzed text.
    ///
    /// * `name` - recognizer name, reported on every detection.
    /// * `entity` - entity type attached to every detection.
    /// * `terms` - dictionary entries to search for. Empty strings are
    ///   ignored (see below).
    /// * `score` - score attached to every detection.
    /// * `source` - label stored in each detection's dictionary explanation.
    /// * `case_insensitive` - when `true`, matching ignores ASCII letter case.
    ///   Only ASCII letters are folded; non-ASCII characters must still match
    ///   exactly.
    ///
    /// When several terms match at the same position, the longest one wins
    /// (`MatchKind::LeftmostLongest`).
    ///
    /// # Panics
    ///
    /// Panics if the Aho-Corasick automaton cannot be built from `terms`.
    pub fn new(
        name: impl Into<String>,
        entity: EntityType,
        terms: &[String],
        score: f32,
        source: impl Into<String>,
        case_insensitive: bool,
    ) -> Self {
        // An empty pattern matches at every position of the haystack that is
        // not covered by a longer term, which would flood the results with
        // zero-length detections. Such entries carry no information, so they
        // are dropped before the automaton is built.
        let patterns = terms.iter().filter(|term| !term.is_empty());

        let matcher = AhoCorasickBuilder::new()
            .ascii_case_insensitive(case_insensitive)
            .match_kind(MatchKind::LeftmostLongest)
            .build(patterns)
            .expect("failed to build dictionary matcher");

        Self {
            name: name.into(),
            entity,
            matcher,
            score,
            source: source.into(),
        }
    }
}
impl Recognizer for DictionaryRecognizer {
    /// Returns the name this recognizer was constructed with.
    fn name(&self) -> &str {
        self.name.as_str()
    }

    /// Returns a one-element slice holding the recognizer's entity type.
    fn supported_entities(&self) -> &[EntityType] {
        // Borrow the single field as a slice instead of allocating a Vec.
        std::slice::from_ref(&self.entity)
    }

    /// Scans `text` for dictionary terms and returns one detection per match.
    ///
    /// `start` and `end` are taken directly from the matcher's reported match
    /// span. Every detection carries the recognizer's configured entity type,
    /// score, name and source label. The NLP artifacts are not used.
    fn analyze(&self, text: &str, _artifacts: &NlpArtifacts) -> Vec<Detection> {
        let mut detections = Vec::new();
        for hit in self.matcher.find_iter(text) {
            let explanation = DetectionExplanation::Dictionary {
                source: self.source.clone(),
            };
            detections.push(Detection {
                entity_type: self.entity.clone(),
                start: hit.start(),
                end: hit.end(),
                score: self.score,
                recognizer: self.name.clone(),
                explanation,
            });
        }
        detections
    }
}