// unobtanium_segmenter/segmentation/lingua_with_sentence_splitter.rs
use std::vec::IntoIter;
6
7use lingua::LanguageDetector;
8use lingua::LanguageDetectorBuilder;
9use unicode_segmentation::UnicodeSegmentation;
10
11use crate::SegmentedToken;
12use crate::UseOrSubdivide;
13use crate::language_detection::DetectionIterator;
14use crate::language_detection::detect_top_n_languages;
15use crate::language_detection::lingua_language_to_whatlang_language;
16use crate::segmentation::Segmenter;
17use crate::segmentation::UnicodeSentenceSplitter;
18
/// A [`Segmenter`] that splits a token into contiguous same-language blocks
/// (detected with lingua) and further splits each block at Unicode sentence
/// boundaries, emitting an empty end-of-sentence marker token after each
/// sentence.
pub struct LinguaLanguageBlockSentenceSplitter {
    // Detector used for the initial top-N language pre-selection; a narrower
    // per-call detector is rebuilt from the detected candidates in `subdivide`.
    language_detector: LanguageDetector,
}
33
34impl Default for LinguaLanguageBlockSentenceSplitter {
35 fn default() -> Self {
36 Self {
37 language_detector: LanguageDetectorBuilder::from_all_languages()
38 .with_preloaded_language_models()
39 .build(),
40 }
41 }
42}
43
44impl LinguaLanguageBlockSentenceSplitter {
45 pub fn new() -> Self {
47 Default::default()
48 }
49
50 pub fn new_with_builder(mut builder: LanguageDetectorBuilder) -> Self {
52 Self {
53 language_detector: builder.build(),
54 }
55 }
56}
57
impl Segmenter for LinguaLanguageBlockSentenceSplitter {
    type SubdivisionIter<'a> = IntoIter<SegmentedToken<'a>>;

    /// Subdivides `token` into same-language runs, then splits each run at
    /// Unicode sentence boundaries. After every completed sentence an empty
    /// token flagged as end-of-sentence is emitted (see
    /// `SegmentedToken::new_end_of_sentence`). Any trailing text past the
    /// last language-detected span is emitted as a final token without a
    /// language annotation.
    fn subdivide<'a>(
        &self,
        token: SegmentedToken<'a>,
    ) -> UseOrSubdivide<SegmentedToken<'a>, IntoIter<SegmentedToken<'a>>> {
        // Pre-select the 3 most likely languages for the whole token to
        // narrow the per-span detection below.
        let languages = detect_top_n_languages(&self.language_detector, 3, token.text);

        if languages.is_empty() {
            // Nothing detected at all: fall back to plain sentence splitting
            // with no language annotations.
            return UnicodeSentenceSplitter::new().subdivide(token);
        }

        // Re-run detection with a detector restricted to the pre-selected
        // candidates; DetectionIterator yields (start, end, language) spans.
        // NOTE(review): assumes start/end are byte offsets into `token.text`
        // compatible with slicing — confirm against DetectionIterator.
        let detection_iterator = DetectionIterator::detect(
            &LanguageDetectorBuilder::from_languages(&languages).build(),
            token.text,
        );
        // Lengths of successive sentences; summed below into absolute offsets.
        let mut sentence_iterator = token.text.split_sentence_bounds().map(|s| s.len());

        let mut collection: Vec<SegmentedToken<'_>> = Vec::new();

        // End offset of the last language span processed so far.
        let mut last_offset = 0;
        // Absolute byte offset of the next sentence boundary (first sentence
        // length, since offsets start at 0); defaults to text end when there
        // are no sentences.
        let mut next_sentence_split_at = sentence_iterator.next().unwrap_or(token.text.len());

        for (mut start_index, end_index, language) in detection_iterator {
            // Map lingua's language to the whatlang enum used by
            // SegmentedToken; None if there is no mapping.
            let language = language.and_then(lingua_language_to_whatlang_language);
            // Emit every sentence that completes inside this language span.
            while next_sentence_split_at > start_index && next_sentence_split_at <= end_index {
                collection.push(
                    SegmentedToken::new_derived_from(
                        &token.text[start_index..next_sentence_split_at],
                        &token,
                    )
                    .with_detected_language(language, true, 1.0),
                );
                // Empty marker token signalling the sentence boundary.
                collection.push(
                    SegmentedToken::new_end_of_sentence(
                        &token.text[next_sentence_split_at..next_sentence_split_at],
                    )
                    .with_detected_language(language, true, 1.0),
                );

                start_index = next_sentence_split_at;
                // Advance to the next boundary by adding the next sentence's
                // length to the current absolute offset.
                next_sentence_split_at = sentence_iterator
                    .next()
                    .map(|n| n + next_sentence_split_at)
                    .unwrap_or_else(|| token.text.len());
            }
            // Remainder of the span after the last full sentence (or the
            // whole span if no boundary fell inside it).
            if start_index != end_index {
                collection.push(
                    SegmentedToken::new_derived_from(&token.text[start_index..end_index], &token)
                        .with_detected_language(language, true, 1.0),
                );
            }

            last_offset = end_index;
        }

        // Text past the final language span is kept, but carries no language
        // annotation and no end-of-sentence marker.
        if last_offset != token.text.len() {
            collection.push(SegmentedToken::new_derived_from(
                &token.text[last_offset..],
                &token,
            ));
        }

        UseOrSubdivide::Subdivide(collection.into_iter())
    }
}
134
#[cfg(test)]
mod test {

    use whatlang::Lang;

    use std::time::Instant;

    use super::*;

    use crate::chain::ChainSegmenter;
    use crate::chain::StartSegmentationChain;

    /// Mixed French/German/English input must come back as one token per
    /// sentence, each tagged with its own language, with an empty
    /// end-of-sentence marker after every sentence.
    #[test]
    fn test_lingua_multilanguage_detection() {
        let test_text = "Parlez-vous français? \
            Ich spreche Französisch nur ein bisschen. \
            A little bit is better than nothing. ";

        let lingua_segmenter = LinguaLanguageBlockSentenceSplitter::new();

        let result: Vec<(&str, Option<Lang>, bool)> = test_text
            .start_segmentation_chain()
            .chain_segmenter(&lingua_segmenter)
            .map(|t| (t.text, t.detected_language, t.is_end_of_sentence))
            .collect();

        let expected_tokens = vec![
            ("Parlez-vous français? ", Some(Lang::Fra), false),
            ("", Some(Lang::Fra), true),
            (
                "Ich spreche Französisch nur ein bisschen. ",
                Some(Lang::Deu),
                false,
            ),
            ("", Some(Lang::Deu), true),
            (
                "A little bit is better than nothing. ",
                Some(Lang::Eng),
                false,
            ),
            ("", Some(Lang::Eng), true),
        ];

        assert_eq!(result, expected_tokens);
    }

    /// A single-language multi-sentence text must be split at every sentence
    /// boundary, with trailing whitespace kept on the sentence it follows
    /// and an empty end-of-sentence token after each sentence.
    #[test]
    fn test_lingua_sentence_splitting() {
        let test_text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur eu tincidunt enim. Praesent ornare pharetra ligula, sit amet sodales elit gravida ut. In hac habitasse platea dictumst. Aliquam quam odio, tristique at venenatis ut, auctor sed nibh. Sed et sapien lobortis, sagittis lacus sed, posuere tellus. Donec tincidunt dictum tempor. Aenean nec nisl commodo, venenatis leo nec, cursus tellus. Mauris finibus facilisis ultrices. Quisque quis lobortis odio. Cras at nisi augue.";

        let lingua_segmenter = LinguaLanguageBlockSentenceSplitter::new();

        let result: Vec<(&str, bool)> = test_text
            .start_segmentation_chain()
            .chain_segmenter(&lingua_segmenter)
            .map(|t| (t.text, t.is_end_of_sentence))
            .collect();

        let expected_tokens = vec![
            (
                "Lorem ipsum dolor sit amet, consectetur adipiscing elit. ",
                false,
            ),
            ("", true),
            ("Curabitur eu tincidunt enim. ", false),
            ("", true),
            (
                "Praesent ornare pharetra ligula, sit amet sodales elit gravida ut. ",
                false,
            ),
            ("", true),
            ("In hac habitasse platea dictumst. ", false),
            ("", true),
            (
                "Aliquam quam odio, tristique at venenatis ut, auctor sed nibh. ",
                false,
            ),
            ("", true),
            (
                "Sed et sapien lobortis, sagittis lacus sed, posuere tellus. ",
                false,
            ),
            ("", true),
            ("Donec tincidunt dictum tempor. ", false),
            ("", true),
            (
                "Aenean nec nisl commodo, venenatis leo nec, cursus tellus. ",
                false,
            ),
            ("", true),
            ("Mauris finibus facilisis ultrices. ", false),
            ("", true),
            ("Quisque quis lobortis odio. ", false),
            ("", true),
            ("Cras at nisi augue.", false),
            ("", true),
        ];

        // On mismatch, print an element-by-element diff before panicking so
        // the failing token is easy to locate in a long expectation list.
        if result != expected_tokens {
            eprintln!("Sentence splitting test failed!");
            eprintln!("\nExpected output:");
            for (n, line) in expected_tokens.iter().enumerate() {
                eprintln!("* {n}: {line:?}");
            }
            eprintln!("\nGot output:");
            for (n, line) in result.iter().enumerate() {
                let is_ok = expected_tokens.get(n) == Some(line);
                eprintln!("* {n}: {line:?} is_ok: {is_ok}");
            }

            panic!("Sentence splitting test failed: check stderr output.");
        }
    }

    /// Constructing the segmenter repeatedly should be cheap after the first
    /// use: the first iteration pays the model-loading cost, subsequent ones
    /// must average faster. NOTE(review): this is a timing-based assertion
    /// and may be flaky on heavily loaded machines.
    #[test]
    fn test_lingua_performance() {
        let test_text = "Parlez-vous français? \
            Ich spreche Französisch nur ein bisschen. \
            A little bit is better than nothing.";

        let start_instant = Instant::now();

        let _result: Vec<(&str, Option<Lang>)> = test_text
            .start_segmentation_chain()
            .chain_segmenter(&LinguaLanguageBlockSentenceSplitter::new())
            .map(|t| (t.text, t.detected_language))
            .collect();

        let time_first_iteration = start_instant.elapsed();
        let start_instant = Instant::now();

        for _ in 0..100 {
            let _result: Vec<(&str, Option<Lang>)> = test_text
                .start_segmentation_chain()
                .chain_segmenter(&LinguaLanguageBlockSentenceSplitter::new())
                .map(|t| (t.text, t.detected_language))
                .collect();
        }

        let time_multiple_iterations = start_instant.elapsed();

        assert!(
            time_first_iteration > (time_multiple_iterations / 100),
            "Subsequent iterations should be faster than the initial one, even when not keeping the struct around. {time_first_iteration:?} > {:?}",
            time_multiple_iterations / 100
        );
    }
}