unobtanium_segmenter/segmentation/decomposition_fst.rs

// SPDX-FileCopyrightText: 2026 Slatian
//
// SPDX-License-Identifier: LGPL-3.0-only

use fst::Error;
use fst::raw::Fst;
use fst::raw::Output;

use std::vec::IntoIter;

use crate::SegmentedToken;
use crate::UseOrSubdivide;
use crate::segmentation::Segmenter;
/// Decomposes compound words into their parts as found in a given dictionary. Useful for compressed, memory-mapped dictionaries.
///
/// This implementation is based on the [fst] crate.
///
/// A word is decomposed only if it can be fully resolved using the dictionary; any text containing unknown "words" is passed through as-is.
///
/// This is an adaptation of the [`FstSegmenter` from charabia](https://github.com/meilisearch/charabia/blob/main/charabia/src/segmenter/utils.rs) for this crate. Unlike charabia, this implementation does not split unknown segments using heuristics or character splitting.
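///
/// A minimal usage sketch, mirroring the test at the bottom of this file
/// (it assumes the `unobtanium_segmenter::…` paths below are the public
/// re-exports of the `crate::…` items used in this module):
///
/// ```
/// use unobtanium_segmenter::SubdivisionMap;
/// use unobtanium_segmenter::initial_paragraph_splitter::InitialParagraphSplitter;
/// use unobtanium_segmenter::segmentation::DecompositionFst;
/// use unobtanium_segmenter::segmentation::Segmenter;
///
/// // The word list must be lexicographically ordered.
/// let decomposer = DecompositionFst::from_dictionary(vec!["house", "light"]).unwrap();
/// let splitter = InitialParagraphSplitter::new("lighthouse");
/// let tokens = SubdivisionMap::new(splitter, |t| decomposer.subdivide(t));
///
/// let words: Vec<&str> = tokens.map(|t| t.text).collect();
/// assert_eq!(words, vec!["light", "house"]);
/// ```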
#[derive(Clone)]
pub struct DecompositionFst<D: AsRef<[u8]>> {
	dictionary: Fst<D>,
}
impl DecompositionFst<Vec<u8>> {
	/// Convenience constructor for `DecompositionFst`;
	/// takes a list of lexicographically ordered words
	/// to recognize as valid parts for decomposition.
	///
	/// If you're using this constructor outside of testing
	/// and development, please have a look at the [DecompositionAhoCorasick]
	/// implementation, as it is more likely to fit your use case
	/// of an in-memory automaton matcher.
	///
	/// [DecompositionAhoCorasick]: crate::segmentation::DecompositionAhoCorasick
	///
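	/// A small usage sketch (it assumes, as elsewhere in these docs, that
	/// this type is re-exported as `unobtanium_segmenter::segmentation::DecompositionFst`,
	/// and that the underlying [fst] builder rejects out-of-order input):
	///
	/// ```
	/// use unobtanium_segmenter::segmentation::DecompositionFst;
	///
	/// // The word list must already be sorted.
	/// assert!(DecompositionFst::from_dictionary(vec!["bar", "baz", "foo"]).is_ok());
	/// // Unsorted input is reported as an error by the underlying builder.
	/// assert!(DecompositionFst::from_dictionary(vec!["foo", "bar"]).is_err());
	/// ```
	///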
	pub fn from_dictionary<I, P>(dict: I) -> Result<Self, Error>
	where
		I: IntoIterator<Item = P>,
		P: AsRef<[u8]>,
	{
		let fst = Fst::from_iter_set(dict)?;

		Ok(DecompositionFst::from_fst(fst))
	}
}

impl<D> DecompositionFst<D>
where
	D: AsRef<[u8]>,
{
	/// Constructs this decomposer from an existing [Fst] struct.
	///
	/// This is the recommended way, as it allows you to choose the best
	/// data store that fits your use case.
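	///
	/// A minimal sketch with a pre-built FST read from a file (the file
	/// name is made up; in practice the bytes could just as well come
	/// from a memory map):
	///
	/// ```no_run
	/// use fst::raw::Fst;
	/// use unobtanium_segmenter::segmentation::DecompositionFst;
	///
	/// // Wrap the serialized dictionary without rebuilding it.
	/// let bytes = std::fs::read("dictionary.fst").unwrap();
	/// let decomposer = DecompositionFst::from_fst(Fst::new(bytes).unwrap());
	/// ```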
	pub fn from_fst(dictionary: Fst<D>) -> Self {
		DecompositionFst { dictionary }
	}
}

impl<D> Segmenter for DecompositionFst<D>
where
	D: AsRef<[u8]>,
{
	type SubdivisionIter<'a> = IntoIter<SegmentedToken<'a>>;

	fn subdivide<'a>(
		&self,
		token: SegmentedToken<'a>,
	) -> UseOrSubdivide<SegmentedToken<'a>, IntoIter<SegmentedToken<'a>>> {
		if token.is_known_word {
			return UseOrSubdivide::Use(token);
		}
		let mut cuts = Vec::new();
		let mut pos: usize = 0;

		// Greedily take the longest dictionary word at each position and
		// record its length.
		while let Some((_, length)) =
			find_longest_prefix(&self.dictionary, &token.text.as_bytes()[pos..])
		{
			cuts.push(length);
			pos += length;
			// The prefix search works on bytes; a match that ends inside a
			// multi-byte character means the dictionary contains invalid UTF-8.
			if !token.text.is_char_boundary(pos) && pos != token.text.len() {
				eprintln!(
					"Detected invalid utf8 in dictionary, not subdividing token: {:?}",
					token.text
				);
				return UseOrSubdivide::Use(token);
			}
		}
		if pos == token.text.len() {
			let mut subsegments = Vec::<SegmentedToken>::with_capacity(cuts.len() + 1);
			let mut text = token.text;
			for pos in cuts {
				// Every time we split here `text` gets shorter, which is
				// why we can use the pushed lengths as positions here.
				let (word, rest) = text.split_at(pos);
				text = rest;
				subsegments
					.push(SegmentedToken::new_derived_from(word, &token).with_is_known_word(true));
			}
			UseOrSubdivide::Subdivide(subsegments.into_iter())
		} else {
			// The matches did not cover the whole token, so it could not be
			// fully resolved against the dictionary; return it whole and unchanged.
			UseOrSubdivide::Use(token)
		}
	}
}

/// Find the longest key that is a prefix of the given value.
///
/// This was taken unmodified from charabia at 2025-05-03, commit `8523fa8`.
#[inline]
fn find_longest_prefix<D>(fst: &Fst<D>, value: &[u8]) -> Option<(u64, usize)>
where
	D: AsRef<[u8]>,
{
	let mut node = fst.root();
	let mut out = Output::zero();
	let mut last_match = None;
	for (i, &b) in value.iter().enumerate() {
		if let Some(trans_index) = node.find_input(b) {
			let t = node.transition(trans_index);
			node = fst.node(t.addr);
			out = out.cat(t.out);
			if node.is_final() {
				last_match = Some((out.cat(node.final_output()).value(), i + 1));
			}
		} else {
			return last_match;
		}
	}
	last_match
}

#[cfg(test)]
mod test {

	use super::*;
	use crate::SubdivisionMap;
	use crate::initial_paragraph_splitter::InitialParagraphSplitter;

	#[test]
	fn test_decomposition_fst() {
		let decomposer = DecompositionFst::from_dictionary(vec!["bar", "baz", "foo"]).unwrap();

		let splitter = InitialParagraphSplitter::new("foobarbaz fooquux foo bazbaz");

		let subsplitter = SubdivisionMap::new(splitter, |s| decomposer.subdivide(s));

		let result: Vec<&str> = subsplitter.map(|s| s.text).collect();

		let expected_result = vec![
			"foo", "bar", "baz", " ", "fooquux", " ", "foo", " ", "baz", "baz",
		];

		assert_eq!(result, expected_result);
	}
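
	// Exercises the prefix search on its own (it is private, so this is
	// the only place it can be demonstrated): it should report the longest
	// dictionary entry that prefixes the input, as a length in bytes.
	#[test]
	fn test_find_longest_prefix() {
		let fst = Fst::from_iter_set(vec!["light", "lighthouse"]).unwrap();

		// "lighthouses" starts with both entries; the longer one wins.
		assert_eq!(
			find_longest_prefix(&fst, b"lighthouses").map(|(_, len)| len),
			Some(10)
		);
		// "lightly" only starts with "light".
		assert_eq!(
			find_longest_prefix(&fst, b"lightly").map(|(_, len)| len),
			Some(5)
		);
		// No entry is a prefix of "dark".
		assert_eq!(find_longest_prefix(&fst, b"dark").map(|(_, len)| len), None);
	}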
}