// unobtanium_segmenter/segmentation/lingua_with_sentence_splitter.rs
use std::vec::IntoIter;
6
7use lingua::LanguageDetector;
8use lingua::LanguageDetectorBuilder;
9use unicode_segmentation::UnicodeSegmentation;
10
11use crate::SegmentedToken;
12use crate::UseOrSubdivide;
13use crate::language_detection::DetectionIterator;
14use crate::language_detection::detect_top_n_languages;
15use crate::language_detection::lingua_language_to_whatlang_language;
16use crate::segmentation::Segmenter;
17use crate::segmentation::UnicodeSentenceSplitter;
18
19pub struct LinguaLanguageBlockSentenceSplitter {
31 language_detector: LanguageDetector,
32}
33
34impl Default for LinguaLanguageBlockSentenceSplitter {
35 fn default() -> Self {
36 Self {
37 language_detector: LanguageDetectorBuilder::from_all_languages()
38 .with_preloaded_language_models()
39 .build(),
40 }
41 }
42}
43
44impl LinguaLanguageBlockSentenceSplitter {
45 pub fn new() -> Self {
47 Default::default()
48 }
49
50 pub fn new_with_builder(mut builder: LanguageDetectorBuilder) -> Self {
52 Self {
53 language_detector: builder.build(),
54 }
55 }
56}
57
58impl Segmenter for LinguaLanguageBlockSentenceSplitter {
59 type SubdivisionIter<'a> = IntoIter<SegmentedToken<'a>>;
60
61 fn subdivide<'a>(
62 &self,
63 token: SegmentedToken<'a>,
64 ) -> UseOrSubdivide<SegmentedToken<'a>, IntoIter<SegmentedToken<'a>>> {
65 let languages = detect_top_n_languages(&self.language_detector, 3, token.text);
66
67 if languages.is_empty() {
70 return UnicodeSentenceSplitter::new().subdivide(token);
72 }
73
74 let detection_iterator = DetectionIterator::detect(
75 &LanguageDetectorBuilder::from_languages(&languages).build(),
77 token.text,
78 );
79 let mut sentence_iterator = token.text.split_sentence_bounds().map(|s| s.len());
80
81 let mut collection: Vec<SegmentedToken<'_>> = Vec::new();
82
83 let mut last_offset = 0;
84 let mut next_sentence_split_at = sentence_iterator.next().unwrap_or(token.text.len());
85
86 for (mut start_index, end_index, language) in detection_iterator {
87 let language = language.and_then(lingua_language_to_whatlang_language);
91 if next_sentence_split_at > start_index && next_sentence_split_at < end_index {
92 collection.push(
93 SegmentedToken::new_derived_from(
94 &token.text[start_index..next_sentence_split_at],
95 &token,
96 )
97 .with_detected_language(language, true, 1.0),
98 );
99 collection.push(
100 SegmentedToken::new_end_of_sentence(
101 &token.text[next_sentence_split_at..next_sentence_split_at],
102 )
103 .with_detected_language(language, true, 1.0),
104 );
105
106 start_index = next_sentence_split_at;
107 next_sentence_split_at = sentence_iterator
108 .next()
109 .map(|n| n + next_sentence_split_at)
110 .unwrap_or_else(|| token.text.len());
111 }
112 collection.push(
113 SegmentedToken::new_derived_from(&token.text[start_index..end_index], &token)
114 .with_detected_language(language, true, 1.0),
115 );
116
117 if next_sentence_split_at == end_index {
118 collection.push(
119 SegmentedToken::new_end_of_sentence(
120 &token.text[next_sentence_split_at..next_sentence_split_at],
121 )
122 .with_detected_language(language, true, 1.0),
123 );
124 next_sentence_split_at = sentence_iterator
125 .next()
126 .map(|n| n + next_sentence_split_at)
127 .unwrap_or_else(|| token.text.len());
128 }
129
130 last_offset = end_index;
131 }
132
133 if last_offset != token.text.len() {
134 collection.push(SegmentedToken::new_derived_from(
135 &token.text[last_offset..],
136 &token,
137 ));
138 }
139
140 UseOrSubdivide::Subdivide(collection.into_iter())
141 }
142}
143
#[cfg(test)]
mod test {

    use std::time::Instant;

    use whatlang::Lang;

    use super::*;

    use crate::chain::ChainSegmenter;
    use crate::chain::StartSegmentationChain;

    /// A text with one French, one German and one English sentence must come
    /// back as three language-tagged sentences, each followed by an empty
    /// end-of-sentence token carrying the same language.
    #[test]
    fn test_lingua_multilanguage_detection() {
        let test_text = "Parlez-vous français? \
            Ich spreche Französisch nur ein bisschen. \
            A little bit is better than nothing. ";

        let splitter = LinguaLanguageBlockSentenceSplitter::new();

        let expected_tokens = vec![
            ("Parlez-vous français? ", Some(Lang::Fra)),
            ("", Some(Lang::Fra)),
            (
                "Ich spreche Französisch nur ein bisschen. ",
                Some(Lang::Deu),
            ),
            ("", Some(Lang::Deu)),
            ("A little bit is better than nothing. ", Some(Lang::Eng)),
            ("", Some(Lang::Eng)),
        ];

        // Run repeatedly with the same segmenter; every pass has to give the same answer.
        for _ in 0..100 {
            let result: Vec<(&str, Option<Lang>)> = test_text
                .start_segmentation_chain()
                .chain_segmenter(&splitter)
                .map(|segment| (segment.text, segment.detected_language))
                .collect();

            assert_eq!(result, expected_tokens);
        }
    }

    /// Compares the duration of the very first run (fresh splitter) against the
    /// average duration of 100 further runs that each construct a fresh splitter.
    #[test]
    fn test_lingua_performance() {
        let test_text = "Parlez-vous français? \
            Ich spreche Französisch nur ein bisschen. \
            A little bit is better than nothing.";

        let first_run_started = Instant::now();

        let _result: Vec<(&str, Option<Lang>)> = test_text
            .start_segmentation_chain()
            .chain_segmenter(&LinguaLanguageBlockSentenceSplitter::new())
            .map(|segment| (segment.text, segment.detected_language))
            .collect();

        let time_first_iteration = first_run_started.elapsed();
        let repeated_runs_started = Instant::now();

        for _ in 0..100 {
            let _result: Vec<(&str, Option<Lang>)> = test_text
                .start_segmentation_chain()
                .chain_segmenter(&LinguaLanguageBlockSentenceSplitter::new())
                .map(|segment| (segment.text, segment.detected_language))
                .collect();
        }

        let time_multiple_iterations = repeated_runs_started.elapsed();
        let average_repeat_time = time_multiple_iterations / 100;

        assert!(
            time_first_iteration > average_repeat_time,
            "Subsequent iterations should be faster than the initial one, even when not keeping the struct around. {time_first_iteration:?} > {:?}",
            average_repeat_time
        );
    }
}