unobtanium_segmenter/segmentation/decomposition_fst.rs

use fst::raw::Fst;
use fst::raw::Output;
use fst::Error;

use std::vec::IntoIter;

use crate::segmentation::Segmenter;
use crate::SegmentedToken;
use crate::UseOrSubdivide;

/// Decompose compound words into their parts found in a given dictionary. Useful for compressed, memory-mapped dictionaries.
///
/// This implementation is based on the [fst] crate.
///
/// This will decompose a word only if it can be fully resolved using the dictionary; any text containing unknown "words" is passed through as-is.
///
/// This is an adaptation of the [`FstSegmenter` from charabia](https://github.com/meilisearch/charabia/blob/main/charabia/src/segmenter/utils.rs) for this crate. Unlike charabia, it does not split unmatched text into unknown segments using heuristics or character splitting.
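///
/// # Example
///
/// A minimal sketch mirroring the unit test below; the `use` paths are
/// assumptions about this crate's public module layout:
///
/// ```ignore
/// use unobtanium_segmenter::initial_paragraph_splitter::InitialParagraphSplitter;
/// use unobtanium_segmenter::segmentation::DecompositionFst;
/// use unobtanium_segmenter::SubdivisionMap;
///
/// // Dictionary words must be lexicographically ordered.
/// let decomposer = DecompositionFst::from_dictionary(vec!["bar", "foo"]).unwrap();
/// let splitter = InitialParagraphSplitter::new("foobar quux");
/// let tokens: Vec<&str> =
/// 	SubdivisionMap::new(splitter, |t| decomposer.subdivide(t)).map(|t| t.text).collect();
///
/// // "foobar" is fully covered by the dictionary and gets decomposed;
/// // "quux" is unknown and passes through as-is.
/// assert_eq!(tokens, vec!["foo", "bar", " ", "quux"]);
/// ```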
#[derive(Clone)]
pub struct DecompositionFst<D: AsRef<[u8]>> {
	dictionary: Fst<D>,
}

impl DecompositionFst<Vec<u8>> {
	/// Convenience constructor for `DecompositionFst`;
	/// takes a list of lexicographically ordered words
	/// to recognize as valid parts for decomposition.
	///
	/// If you're using this constructor outside of testing
	/// and development, please have a look at the [DecompositionAhoCorasick]
	/// implementation, as it is more likely to fit your use case
	/// of an in-memory automaton matcher.
	///
	/// [DecompositionAhoCorasick]: crate::segmentation::DecompositionAhoCorasick
	///
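	/// A short sketch (the `use` path is an assumption):
	///
	/// ```ignore
	/// use unobtanium_segmenter::segmentation::DecompositionFst;
	///
	/// // The input must be lexicographically ordered; the underlying
	/// // `Fst::from_iter_set` rejects out-of-order keys.
	/// let decomposer = DecompositionFst::from_dictionary(vec!["bar", "baz", "foo"]).unwrap();
	/// assert!(DecompositionFst::from_dictionary(vec!["foo", "bar"]).is_err());
	/// ```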
	pub fn from_dictionary<I, P>(dict: I) -> Result<Self, Error>
	where
		I: IntoIterator<Item = P>,
		P: AsRef<[u8]>,
	{
		let fst = Fst::from_iter_set(dict)?;

		Ok(DecompositionFst::from_fst(fst))
	}
}

impl<D> DecompositionFst<D>
where
	D: AsRef<[u8]>,
{
	/// Construct this decomposer from an existing [Fst] struct.
	///
	/// This is the recommended way, as it allows you to choose the best
	/// data store that fits your use case.
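	///
	/// A sketch of loading a pre-built, memory-mapped dictionary; the
	/// `memmap2` dependency and the file name are assumptions:
	///
	/// ```ignore
	/// use fst::raw::Fst;
	/// use unobtanium_segmenter::segmentation::DecompositionFst;
	///
	/// let file = std::fs::File::open("dictionary.fst")?;
	/// // Safety: the underlying file must not be modified while mapped.
	/// let mmap = unsafe { memmap2::Mmap::map(&file)? };
	/// let decomposer = DecompositionFst::from_fst(Fst::new(mmap)?);
	/// ```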
	pub fn from_fst(dictionary: Fst<D>) -> Self {
		DecompositionFst { dictionary }
	}
}

impl<D> Segmenter for DecompositionFst<D>
where
	D: AsRef<[u8]>,
{
	type SubdivisionIter<'a> = IntoIter<SegmentedToken<'a>>;

	fn subdivide<'a>(
		&self,
		token: SegmentedToken<'a>,
	) -> UseOrSubdivide<SegmentedToken<'a>, IntoIter<SegmentedToken<'a>>> {
		if token.is_known_word {
			return UseOrSubdivide::Use(token);
		}
		let mut cuts = Vec::new();
		let mut pos: usize = 0;

		// Greedily take the longest dictionary prefix at each position and
		// record its byte length. E.g. with the dictionary {"bar", "foo"},
		// the token "foobar" yields cuts [3, 3]. Because the search always
		// resumes exactly at `pos`, consecutive matches can never leave gaps.
		while let Some((_, length)) =
			find_longest_prefix(&self.dictionary, &token.text.as_bytes()[pos..])
		{
			cuts.push(length);
			pos += length;
			// A match may end inside a multi-byte character if the dictionary
			// contains byte sequences that are not valid UTF-8; splitting there
			// would panic, so pass the token through untouched instead.
			if !token.text.is_char_boundary(pos) && pos != token.text.len() {
				eprintln!(
					"Detected invalid utf8 in dictionary, not subdividing token: {:?}",
					token.text
				);
				return UseOrSubdivide::Use(token);
			}
		}

		if pos == token.text.len() {
			let mut subsegments = Vec::<SegmentedToken>::with_capacity(cuts.len() + 1);
			let mut text = token.text;
			for pos in cuts {
				// Every time we split here, `text` gets shorter, which is
				// why the pushed lengths can be used as positions directly.
				let (word, rest) = text.split_at(pos);
				text = rest;
				subsegments
					.push(SegmentedToken::new_derived_from(word, &token).with_is_known_word(true));
			}
			UseOrSubdivide::Subdivide(subsegments.into_iter())
		} else {
			// The loop stopped before consuming the whole token, so it cannot
			// be fully split into dictionary words; return it whole and unchanged.
			UseOrSubdivide::Use(token)
		}
	}
}

/// Find the longest key that is a prefix of the given value, returning the
/// key's output value and its length in bytes.
///
/// This was taken unmodified from charabia at 2025-05-03 commit `8523fa8`.
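///
/// For example, with the keys `{"foo", "foobar"}`, the value `b"foobarbaz"`
/// matches `"foobar"` and yields `Some((_, 6))`, while `b"bazqux"` yields `None`.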
#[inline]
fn find_longest_prefix<D>(fst: &Fst<D>, value: &[u8]) -> Option<(u64, usize)>
where
	D: AsRef<[u8]>,
{
	let mut node = fst.root();
	let mut out = Output::zero();
	let mut last_match = None;
	for (i, &b) in value.iter().enumerate() {
		if let Some(trans_index) = node.find_input(b) {
			let t = node.transition(trans_index);
			node = fst.node(t.addr);
			out = out.cat(t.out);
			if node.is_final() {
				last_match = Some((out.cat(node.final_output()).value(), i + 1));
			}
		} else {
			return last_match;
		}
	}
	last_match
}

#[cfg(test)]
mod test {

	use super::*;
	use crate::initial_paragraph_splitter::InitialParagraphSplitter;
	use crate::SubdivisionMap;

	#[test]
	fn test_decomposition_fst() {
		let decomposer = DecompositionFst::from_dictionary(vec!["bar", "baz", "foo"]).unwrap();

		let splitter = InitialParagraphSplitter::new("foobarbaz fooquux foo bazbaz");

		let subsplitter = SubdivisionMap::new(splitter, |s| decomposer.subdivide(s));

		let result: Vec<&str> = subsplitter.map(|s| s.text).collect();

		let expected_result = vec![
			"foo", "bar", "baz", " ", "fooquux", " ", "foo", " ", "baz", "baz",
		];

		assert_eq!(result, expected_result);
	}
}