use crate::error::Result;
use crate::lexical::index::inverted::core::terms::{TermStats, TermsEnum};
use crate::util::levenshtein::{damerau_levenshtein_distance, levenshtein_distance};
use regex::Regex;
pub trait Automaton: Send + Sync {
fn matches(&self, candidate: &str) -> bool;
fn initial_seek_term(&self) -> Option<String>;
}
#[derive(Debug, Clone)]
pub struct LevenshteinAutomaton {
pattern: String,
max_edits: u32,
prefix_length: usize,
transpositions: bool,
}
impl LevenshteinAutomaton {
pub fn new(
pattern: impl Into<String>,
max_edits: u32,
prefix_length: usize,
transpositions: bool,
) -> Self {
LevenshteinAutomaton {
pattern: pattern.into(),
max_edits,
prefix_length,
transpositions,
}
}
pub fn pattern(&self) -> &str {
&self.pattern
}
pub fn max_edits(&self) -> u32 {
self.max_edits
}
pub fn prefix_length(&self) -> usize {
self.prefix_length
}
pub fn uses_transpositions(&self) -> bool {
self.transpositions
}
}
impl Automaton for LevenshteinAutomaton {
fn matches(&self, candidate: &str) -> bool {
if self.prefix_length > 0 {
let pattern_prefix: String = self.pattern.chars().take(self.prefix_length).collect();
if !candidate.starts_with(&pattern_prefix) {
return false;
}
}
let distance = if self.transpositions {
damerau_levenshtein_distance(&self.pattern, candidate) as u32
} else {
levenshtein_distance(&self.pattern, candidate) as u32
};
distance <= self.max_edits
}
fn initial_seek_term(&self) -> Option<String> {
if self.prefix_length > 0 {
Some(self.pattern.chars().take(self.prefix_length).collect())
} else {
None
}
}
}
#[derive(Debug, Clone)]
pub struct RegexAutomaton {
regex: Regex,
#[allow(dead_code)]
pattern: String,
initial_seek_term: Option<String>,
}
impl RegexAutomaton {
pub fn new(pattern: &str) -> Result<Self> {
let regex = Regex::new(pattern).map_err(|e| {
crate::error::LaurusError::analysis(format!("Invalid regexp pattern: {e}"))
})?;
let initial_seek_term = Self::extract_prefix(pattern);
Ok(RegexAutomaton {
regex,
pattern: pattern.to_string(),
initial_seek_term,
})
}
pub fn from_regex(regex: Regex, pattern: String) -> Self {
let initial_seek_term = Self::extract_prefix(&pattern);
RegexAutomaton {
regex,
pattern,
initial_seek_term,
}
}
fn extract_prefix(pattern: &str) -> Option<String> {
let mut chars = pattern.chars();
if chars.next() != Some('^') {
return None;
}
let mut prefix = String::new();
let mut escaped = false;
for c in chars {
if escaped {
prefix.push(c);
escaped = false;
} else {
match c {
'\\' => escaped = true,
'.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' | '}' | '^' | '$' => {
break;
}
_ => prefix.push(c),
}
}
}
if prefix.is_empty() {
None
} else {
Some(prefix)
}
}
}
impl Automaton for RegexAutomaton {
fn matches(&self, candidate: &str) -> bool {
self.regex.is_match(candidate)
}
fn initial_seek_term(&self) -> Option<String> {
self.initial_seek_term.clone()
}
}
pub struct AutomatonTermsEnum<T: TermsEnum, A: Automaton> {
inner: T,
automaton: A,
max_matches: Option<usize>,
matches_found: usize,
}
impl<T: TermsEnum, A: Automaton> AutomatonTermsEnum<T, A> {
pub fn new(inner: T, automaton: A) -> Self {
AutomatonTermsEnum {
inner,
automaton,
max_matches: None,
matches_found: 0,
}
}
pub fn with_max_matches(mut self, max_matches: usize) -> Self {
self.max_matches = Some(max_matches);
self
}
pub fn automaton(&self) -> &A {
&self.automaton
}
}
impl<T: TermsEnum, A: Automaton> TermsEnum for AutomatonTermsEnum<T, A> {
fn next(&mut self) -> Result<Option<TermStats>> {
if let Some(max) = self.max_matches
&& self.matches_found >= max
{
return Ok(None);
}
if self.matches_found == 0
&& let Some(seek_term) = self.automaton.initial_seek_term()
{
let _ = self.inner.seek(&seek_term)?;
}
while let Some(term_stats) = self.inner.next()? {
if self.automaton.matches(&term_stats.term) {
self.matches_found += 1;
return Ok(Some(term_stats));
}
if let Some(prefix) = self.automaton.initial_seek_term()
&& !term_stats.term.starts_with(&prefix)
{
return Ok(None);
}
}
Ok(None)
}
fn seek(&mut self, target: &str) -> Result<bool> {
self.inner.seek(target)
}
fn seek_exact(&mut self, term: &str) -> Result<bool> {
self.inner.seek_exact(term)
}
fn current(&self) -> Option<&TermStats> {
self.inner.current()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_levenshtein_automaton() {
let automaton = LevenshteinAutomaton::new("hello", 1, 0, true);
assert!(automaton.matches("hello")); assert!(automaton.matches("helo")); assert!(automaton.matches("hallo")); assert!(automaton.matches("helllo")); assert!(automaton.matches("ehllo"));
assert!(!automaton.matches("world")); assert!(!automaton.matches("hi")); }
#[test]
fn test_prefix_constraint() {
let automaton = LevenshteinAutomaton::new("hello", 2, 2, true);
assert!(automaton.matches("hello")); assert!(automaton.matches("heLLo"));
assert!(!automaton.matches("xello")); assert!(!automaton.matches("world")); }
}