use aho_corasick::AhoCorasick;
use aho_corasick::AhoCorasickBuilder;
use aho_corasick::BuildError;
use aho_corasick::MatchKind;
use std::vec::IntoIter;
use crate::SegmentedToken;
use crate::UseOrSubdivide;
use crate::segmentation::Segmenter;
/// Segmenter that decomposes a token into consecutive dictionary words.
///
/// The dictionary is stored as a pre-built Aho-Corasick automaton, so the
/// token text is matched against all dictionary entries at once.
#[derive(Debug, Clone)]
pub struct DecompositionAhoCorasick {
// Automaton holding the dictionary words used to cut tokens apart.
dictionary: AhoCorasick,
}
impl DecompositionAhoCorasick {
    /// Compiles `dict` into an automaton and wraps it in a decomposer.
    ///
    /// The automaton is built with [`MatchKind::LeftmostLongest`].
    ///
    /// # Errors
    ///
    /// Returns the [`BuildError`] reported by the automaton builder when the
    /// patterns cannot be compiled.
    pub fn from_dictionary<I, P>(dict: I) -> Result<Self, BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        AhoCorasickBuilder::new()
            .match_kind(MatchKind::LeftmostLongest)
            .build(dict)
            .map(Self::from_automaton)
    }

    /// Wraps an already constructed automaton.
    ///
    /// The automaton is used exactly as given; its match kind is not checked
    /// or altered here.
    pub fn from_automaton(dictionary: AhoCorasick) -> Self {
        Self { dictionary }
    }
}
impl Segmenter for DecompositionAhoCorasick {
    type SubdivisionIter<'a> = IntoIter<SegmentedToken<'a>>;

    /// Splits `token` into dictionary words when its text is *entirely*
    /// covered by a gap-free sequence of dictionary matches.
    ///
    /// Returns [`UseOrSubdivide::Use`] with the untouched token when
    /// - the token is already flagged as a known word,
    /// - the token text is empty (there is nothing to decompose, and
    ///   subdividing it into zero segments would silently drop the token), or
    /// - the dictionary matches do not tile the whole text.
    ///
    /// Otherwise returns [`UseOrSubdivide::Subdivide`] with one derived token
    /// per matched word, each marked as a known word.
    fn subdivide<'a>(
        &self,
        token: SegmentedToken<'a>,
    ) -> UseOrSubdivide<SegmentedToken<'a>, Self::SubdivisionIter<'a>> {
        let text = token.text;
        if token.is_known_word || text.is_empty() {
            return UseOrSubdivide::Use(token);
        }

        // Byte offsets at which each matched word ends; every entry is a
        // verified `char` boundary, so slicing below cannot panic.
        let mut word_ends = Vec::new();
        let mut pos: usize = 0;
        for match_ in self.dictionary.find_iter(text) {
            // A gap before this match means the text is not fully covered.
            // Dictionary patterns are raw bytes, so a match may also end in
            // the middle of a multi-byte character; such a match cannot be
            // turned into a `&str` slice and is treated like a gap.
            if match_.start() != pos || !text.is_char_boundary(match_.end()) {
                break;
            }
            // Zero-length matches (an empty dictionary entry) carry no text
            // and must not become empty "known word" segments.
            if match_.is_empty() {
                continue;
            }
            pos = match_.end();
            word_ends.push(pos);
        }

        if pos != text.len() {
            return UseOrSubdivide::Use(token);
        }

        let mut subsegments = Vec::with_capacity(word_ends.len());
        let mut start = 0;
        for end in word_ends {
            let word = &text[start..end];
            subsegments
                .push(SegmentedToken::new_derived_from(word, &token).with_is_kown_word(true));
            start = end;
        }
        UseOrSubdivide::Subdivide(subsegments.into_iter())
    }
}
#[cfg(test)]
mod test {
    use super::*;
    use crate::SubdivisionMap;
    use crate::initial_paragraph_splitter::InitialParagraphSplitter;

    /// Tokens made up solely of dictionary words are split into those words;
    /// every other token (including separators and partially matching words)
    /// passes through unchanged.
    #[test]
    fn test_decomposition_aho_corasick() {
        let words = vec!["foo", "bar", "baz"];
        let decomposer = DecompositionAhoCorasick::from_dictionary(words).unwrap();

        let tokens = InitialParagraphSplitter::new("foobarbaz fooquux foo bazbaz");
        let texts: Vec<&str> = SubdivisionMap::new(tokens, |token| decomposer.subdivide(token))
            .map(|segment| segment.text)
            .collect();

        assert_eq!(
            texts,
            vec!["foo", "bar", "baz", " ", "fooquux", " ", "foo", " ", "baz", "baz"],
        );
    }
}