unobtanium_segmenter/segmentation/
decomposition_aho_corasick.rs1use aho_corasick::AhoCorasick;
2use aho_corasick::AhoCorasickBuilder;
3use aho_corasick::BuildError;
4use aho_corasick::MatchKind;
5
6use std::vec::IntoIter;
7
8use crate::segmentation::Segmenter;
9use crate::SegmentedToken;
10use crate::UseOrSubdivide;
11
12#[derive(Debug, Clone)]
20pub struct DecompositionAhoCorasick {
21 dictionary: AhoCorasick,
22}
23
24impl DecompositionAhoCorasick {
25 pub fn from_dictionary<I, P>(dict: I) -> Result<Self, BuildError>
31 where
32 I: IntoIterator<Item = P>,
33 P: AsRef<[u8]>,
34 {
35 let dict = AhoCorasickBuilder::new()
36 .match_kind(MatchKind::LeftmostLongest)
37 .build(dict)?;
38
39 Ok(Self::from_automaton(dict))
40 }
41
42 pub fn from_automaton(dictionary: AhoCorasick) -> Self {
47 Self { dictionary }
48 }
49}
50
51impl Segmenter for DecompositionAhoCorasick {
52 type SubdivisionIter<'a> = IntoIter<SegmentedToken<'a>>;
53
54 fn subdivide<'a>(
55 &self,
56 token: SegmentedToken<'a>,
57 ) -> UseOrSubdivide<SegmentedToken<'a>, Self::SubdivisionIter<'a>> {
58 if token.is_known_word {
59 return UseOrSubdivide::Use(token);
60 }
61
62 let mut cuts = Vec::new();
63 let mut pos: usize = 0;
64
65 for match_ in self.dictionary.find_iter(token.text) {
66 if pos != match_.start() {
68 break;
69 }
70 cuts.push(match_.len());
71 pos = match_.end();
72 }
73
74 if pos == token.text.len() {
75 let mut subsegments = Vec::<SegmentedToken>::with_capacity(cuts.len() + 1);
76 let mut text = token.text;
77 for pos in cuts {
78 let (word, rest) = text.split_at(pos);
81 text = rest;
82 subsegments
83 .push(SegmentedToken::new_derived_from(word, &token).with_is_kown_word(true));
84 }
85 return UseOrSubdivide::Subdivide(subsegments.into_iter());
86 } else {
87 return UseOrSubdivide::Use(token);
89 }
90 }
91}
92
93#[cfg(test)]
94mod test {
95
96 use super::*;
97 use crate::initial_paragraph_splitter::InitialParagraphSplitter;
98 use crate::SubdivisionMap;
99
100 #[test]
101 fn test_decomposition_aho_corasick() {
102 let decomposer =
103 DecompositionAhoCorasick::from_dictionary(vec!["foo", "bar", "baz"]).unwrap();
104
105 let splitter = InitialParagraphSplitter::new("foobarbaz fooquux foo bazbaz");
106
107 let subsplitter = SubdivisionMap::new(splitter, |s| decomposer.subdivide(s));
108
109 let result: Vec<&str> = subsplitter.map(|s| s.text).collect();
110
111 let expected_result = vec![
112 "foo", "bar", "baz", " ", "fooquux", " ", "foo", " ", "baz", "baz",
113 ];
114
115 assert_eq!(result, expected_result);
116 }
117}