unobtanium_segmenter/segmentation/
decomposition_fst.rs1use fst::raw::Fst;
2use fst::raw::Output;
3use fst::Error;
4
5use std::vec::IntoIter;
6
7use crate::segmentation::Segmenter;
8use crate::SegmentedToken;
9use crate::UseOrSubdivide;
10
11#[derive(Clone)]
19pub struct DecompositionFst<D: AsRef<[u8]>> {
20 dictionary: Fst<D>,
21}
22
23impl DecompositionFst<Vec<u8>> {
24 pub fn from_dictionary<I, P>(dict: I) -> Result<Self, Error>
36 where
37 I: IntoIterator<Item = P>,
38 P: AsRef<[u8]>,
39 {
40 let fst = Fst::from_iter_set(dict)?;
41
42 Ok(DecompositionFst::from_fst(fst))
43 }
44}
45
46impl<D> DecompositionFst<D>
47where
48 D: AsRef<[u8]>,
49{
50 pub fn from_fst(dictionary: Fst<D>) -> Self {
55 DecompositionFst { dictionary }
56 }
57}
58
59impl<D> Segmenter for DecompositionFst<D>
60where
61 D: AsRef<[u8]>,
62{
63 type SubdivisionIter<'a> = IntoIter<SegmentedToken<'a>>;
64
65 fn subdivide<'a>(
66 &self,
67 token: SegmentedToken<'a>,
68 ) -> UseOrSubdivide<SegmentedToken<'a>, IntoIter<SegmentedToken<'a>>> {
69 if token.is_known_word {
70 return UseOrSubdivide::Use(token);
71 }
72 let mut cuts = Vec::new();
73 let mut pos: usize = 0;
74
75 while let Some((_, length)) =
76 find_longest_prefix(&self.dictionary, &token.text.as_bytes()[pos..])
77 {
78 cuts.push(length);
80 pos += length;
81 if !token.text.is_char_boundary(pos) && pos != token.text.len() {
82 eprintln!(
83 "Detected invalid utf8 in dictionary, not subdividing token: {:?}",
84 token.text
85 );
86 return UseOrSubdivide::Use(token);
87 }
88 }
89
90 if pos == token.text.len() {
91 let mut subsegments = Vec::<SegmentedToken>::with_capacity(cuts.len() + 1);
92 let mut text = token.text;
93 for pos in cuts {
94 let (word, rest) = text.split_at(pos);
97 text = rest;
98 subsegments
99 .push(SegmentedToken::new_derived_from(word, &token).with_is_kown_word(true));
100 }
101 return UseOrSubdivide::Subdivide(subsegments.into_iter());
102 } else {
103 return UseOrSubdivide::Use(token);
105 }
106 }
107}
108
109#[inline]
113fn find_longest_prefix<D>(fst: &Fst<D>, value: &[u8]) -> Option<(u64, usize)>
114where
115 D: AsRef<[u8]>,
116{
117 let mut node = fst.root();
118 let mut out = Output::zero();
119 let mut last_match = None;
120 for (i, &b) in value.iter().enumerate() {
121 if let Some(trans_index) = node.find_input(b) {
122 let t = node.transition(trans_index);
123 node = fst.node(t.addr);
124 out = out.cat(t.out);
125 if node.is_final() {
126 last_match = Some((out.cat(node.final_output()).value(), i + 1));
127 }
128 } else {
129 return last_match;
130 }
131 }
132 last_match
133}
134
135#[cfg(test)]
136mod test {
137
138 use super::*;
139 use crate::initial_paragraph_splitter::InitialParagraphSplitter;
140 use crate::SubdivisionMap;
141
142 #[test]
143 fn test_decomposition_fst() {
144 let decomposer = DecompositionFst::from_dictionary(vec!["bar", "baz", "foo"]).unwrap();
145
146 let splitter = InitialParagraphSplitter::new("foobarbaz fooquux foo bazbaz");
147
148 let subsplitter = SubdivisionMap::new(splitter, |s| decomposer.subdivide(s));
149
150 let result: Vec<&str> = subsplitter.map(|s| s.text).collect();
151
152 let expected_result = vec![
153 "foo", "bar", "baz", " ", "fooquux", " ", "foo", " ", "baz", "baz",
154 ];
155
156 assert_eq!(result, expected_result);
157 }
158}