unobtanium_segmenter/segmentation/decomposition_aho_corasick.rs

use aho_corasick::AhoCorasick;
use aho_corasick::AhoCorasickBuilder;
use aho_corasick::BuildError;
use aho_corasick::MatchKind;

use std::vec::IntoIter;

use crate::segmentation::Segmenter;
use crate::SegmentedToken;
use crate::UseOrSubdivide;

/// Decompose compound words into their parts found in a given dictionary. Useful for small or on-the-fly generated dictionaries.
///
/// This implementation is based on the [aho_corasick] crate.
///
/// A word is decomposed only if it can be fully resolved using the dictionary; any text containing unknown "words" is passed through as-is.
///
/// This is an adaptation of the [`SplitCompoundWords` filter from tantivy](https://docs.rs/tantivy/latest/tantivy/tokenizer/struct.SplitCompoundWords.html) for this crate.
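///
/// # Example
///
/// A minimal sketch of the intended usage, mirroring the test at the bottom
/// of this file; it assumes this crate's `InitialParagraphSplitter` and
/// `SubdivisionMap` as the surrounding pipeline:
///
/// ```ignore
/// let decomposer =
/// 	DecompositionAhoCorasick::from_dictionary(vec!["foo", "bar", "baz"]).unwrap();
/// let splitter = InitialParagraphSplitter::new("foobarbaz fooquux");
/// let tokens = SubdivisionMap::new(splitter, |s| decomposer.subdivide(s));
///
/// // "foobarbaz" is fully covered by the dictionary and gets split,
/// // while "fooquux" contains the unknown "quux" and passes through whole.
/// let result: Vec<&str> = tokens.map(|s| s.text).collect();
/// assert_eq!(result, vec!["foo", "bar", "baz", " ", "fooquux"]);
/// ```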
#[derive(Debug, Clone)]
pub struct DecompositionAhoCorasick {
	dictionary: AhoCorasick,
}

impl DecompositionAhoCorasick {
	/// Create a filter from a given dictionary.
	///
	/// The dictionary will be used to construct an [`AhoCorasick`] automaton
	/// with reasonable defaults. See [`from_automaton`][Self::from_automaton] if
	/// more control over its construction is required.
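	///
	/// # Example
	///
	/// A small sketch; constructing the automaton can fail, hence the
	/// [`BuildError`] in the return type:
	///
	/// ```ignore
	/// let decomposer =
	/// 	DecompositionAhoCorasick::from_dictionary(vec!["foo", "bar", "baz"]).unwrap();
	/// ```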
	pub fn from_dictionary<I, P>(dict: I) -> Result<Self, BuildError>
	where
		I: IntoIterator<Item = P>,
		P: AsRef<[u8]>,
	{
		let dict = AhoCorasickBuilder::new()
			.match_kind(MatchKind::LeftmostLongest)
			.build(dict)?;

		Ok(Self::from_automaton(dict))
	}

	/// Create a filter from a given automaton.
	///
	/// The automaton should use one of the leftmost match kinds (such as
	/// [`MatchKind::LeftmostLongest`]) and it should not be anchored.
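	///
	/// # Example
	///
	/// A sketch of building a suitable automaton by hand; this mirrors what
	/// [`from_dictionary`][Self::from_dictionary] does internally:
	///
	/// ```ignore
	/// use aho_corasick::{AhoCorasickBuilder, MatchKind};
	///
	/// let automaton = AhoCorasickBuilder::new()
	/// 	.match_kind(MatchKind::LeftmostLongest)
	/// 	.build(["foo", "bar", "baz"])
	/// 	.unwrap();
	/// let decomposer = DecompositionAhoCorasick::from_automaton(automaton);
	/// ```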
	pub fn from_automaton(dictionary: AhoCorasick) -> Self {
		Self { dictionary }
	}
}

impl Segmenter for DecompositionAhoCorasick {
	type SubdivisionIter<'a> = IntoIter<SegmentedToken<'a>>;

	fn subdivide<'a>(
		&self,
		token: SegmentedToken<'a>,
	) -> UseOrSubdivide<SegmentedToken<'a>, Self::SubdivisionIter<'a>> {
		if token.is_known_word {
			return UseOrSubdivide::Use(token);
		}

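		// Greedily cover the token with non-overlapping leftmost-longest
		// matches, recording the length of each matched dictionary word.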
		let mut cuts = Vec::new();
		let mut pos: usize = 0;

		for match_ in self.dictionary.find_iter(token.text) {
			// abort if there is uncovered text between consecutive matches
			if pos != match_.start() {
				break;
			}
			cuts.push(match_.len());
			pos = match_.end();
		}

		if pos == token.text.len() {
			let mut subsegments = Vec::<SegmentedToken>::with_capacity(cuts.len() + 1);
			let mut text = token.text;
			for pos in cuts {
				// Every time we split here `text` gets shorter, which is
				// why we can use the pushed lengths as positions here.
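				// E.g. "foobarbaz" with cuts [3, 3, 3] splits into
				// ("foo", "barbaz"), then ("bar", "baz"), then ("baz", "").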
				let (word, rest) = text.split_at(pos);
				text = rest;
				subsegments
					.push(SegmentedToken::new_derived_from(word, &token).with_is_known_word(true));
			}
			UseOrSubdivide::Subdivide(subsegments.into_iter())
		} else {
			// The matches did not cover the entire token, so it cannot be fully
			// split into dictionary words; return it whole and unchanged.
			UseOrSubdivide::Use(token)
		}
	}
}

#[cfg(test)]
mod test {

	use super::*;
	use crate::initial_paragraph_splitter::InitialParagraphSplitter;
	use crate::SubdivisionMap;

	#[test]
	fn test_decomposition_aho_corasick() {
		let decomposer =
			DecompositionAhoCorasick::from_dictionary(vec!["foo", "bar", "baz"]).unwrap();

		let splitter = InitialParagraphSplitter::new("foobarbaz fooquux foo bazbaz");

		let subsplitter = SubdivisionMap::new(splitter, |s| decomposer.subdivide(s));

		let result: Vec<&str> = subsplitter.map(|s| s.text).collect();

		let expected_result = vec![
			"foo", "bar", "baz", " ", "fooquux", " ", "foo", " ", "baz", "baz",
		];

		assert_eq!(result, expected_result);
	}
}