unobtanium_segmenter/segmentation/
decomposition_fst.rs1use fst::Error;
6use fst::raw::Fst;
7use fst::raw::Output;
8
9use std::vec::IntoIter;
10
11use crate::SegmentedToken;
12use crate::UseOrSubdivide;
13use crate::segmentation::Segmenter;
14
15#[derive(Clone)]
23pub struct DecompositionFst<D: AsRef<[u8]>> {
24 dictionary: Fst<D>,
25}
26
27impl DecompositionFst<Vec<u8>> {
28 pub fn from_dictionary<I, P>(dict: I) -> Result<Self, Error>
40 where
41 I: IntoIterator<Item = P>,
42 P: AsRef<[u8]>,
43 {
44 let fst = Fst::from_iter_set(dict)?;
45
46 Ok(DecompositionFst::from_fst(fst))
47 }
48}
49
50impl<D> DecompositionFst<D>
51where
52 D: AsRef<[u8]>,
53{
54 pub fn from_fst(dictionary: Fst<D>) -> Self {
59 DecompositionFst { dictionary }
60 }
61}
62
63impl<D> Segmenter for DecompositionFst<D>
64where
65 D: AsRef<[u8]>,
66{
67 type SubdivisionIter<'a> = IntoIter<SegmentedToken<'a>>;
68
69 fn subdivide<'a>(
70 &self,
71 token: SegmentedToken<'a>,
72 ) -> UseOrSubdivide<SegmentedToken<'a>, IntoIter<SegmentedToken<'a>>> {
73 if token.is_known_word {
74 return UseOrSubdivide::Use(token);
75 }
76 let mut cuts = Vec::new();
77 let mut pos: usize = 0;
78
79 while let Some((_, length)) =
80 find_longest_prefix(&self.dictionary, &token.text.as_bytes()[pos..])
81 {
82 cuts.push(length);
84 pos += length;
85 if !token.text.is_char_boundary(pos) && pos != token.text.len() {
86 eprintln!(
87 "Detected invalid utf8 in dictionary, not subdividing token: {:?}",
88 token.text
89 );
90 return UseOrSubdivide::Use(token);
91 }
92 }
93
94 if pos == token.text.len() {
95 let mut subsegments = Vec::<SegmentedToken>::with_capacity(cuts.len() + 1);
96 let mut text = token.text;
97 for pos in cuts {
98 let (word, rest) = text.split_at(pos);
101 text = rest;
102 subsegments
103 .push(SegmentedToken::new_derived_from(word, &token).with_is_kown_word(true));
104 }
105 return UseOrSubdivide::Subdivide(subsegments.into_iter());
106 } else {
107 return UseOrSubdivide::Use(token);
109 }
110 }
111}
112
113#[inline]
117fn find_longest_prefix<D>(fst: &Fst<D>, value: &[u8]) -> Option<(u64, usize)>
118where
119 D: AsRef<[u8]>,
120{
121 let mut node = fst.root();
122 let mut out = Output::zero();
123 let mut last_match = None;
124 for (i, &b) in value.iter().enumerate() {
125 if let Some(trans_index) = node.find_input(b) {
126 let t = node.transition(trans_index);
127 node = fst.node(t.addr);
128 out = out.cat(t.out);
129 if node.is_final() {
130 last_match = Some((out.cat(node.final_output()).value(), i + 1));
131 }
132 } else {
133 return last_match;
134 }
135 }
136 last_match
137}
138
139#[cfg(test)]
140mod test {
141
142 use super::*;
143 use crate::SubdivisionMap;
144 use crate::initial_paragraph_splitter::InitialParagraphSplitter;
145
146 #[test]
147 fn test_decomposition_fst() {
148 let decomposer = DecompositionFst::from_dictionary(vec!["bar", "baz", "foo"]).unwrap();
149
150 let splitter = InitialParagraphSplitter::new("foobarbaz fooquux foo bazbaz");
151
152 let subsplitter = SubdivisionMap::new(splitter, |s| decomposer.subdivide(s));
153
154 let result: Vec<&str> = subsplitter.map(|s| s.text).collect();
155
156 let expected_result = vec![
157 "foo", "bar", "baz", " ", "fooquux", " ", "foo", " ", "baz", "baz",
158 ];
159
160 assert_eq!(result, expected_result);
161 }
162}