unobtanium_segmenter/segmentation/
lingua_with_sentence_splitter.rs1use std::vec::IntoIter;
6
7use lingua::LanguageDetector;
8use lingua::LanguageDetectorBuilder;
9use unicode_segmentation::UnicodeSegmentation;
10
11use crate::SegmentedToken;
12use crate::UseOrSubdivide;
13use crate::language_detection::DetectionIterator;
14use crate::language_detection::detect_top_n_languages;
15use crate::language_detection::lingua_language_to_whatlang_language;
16use crate::segmentation::Segmenter;
17use crate::segmentation::UnicodeSentenceSplitter;
18
19pub struct LinguaLanguageBlockSentenceSplitter {
29 language_detector: LanguageDetector,
30}
31
32impl Default for LinguaLanguageBlockSentenceSplitter {
33 fn default() -> Self {
34 Self {
35 language_detector: LanguageDetectorBuilder::from_all_languages()
36 .with_preloaded_language_models()
37 .build(),
38 }
39 }
40}
41
42impl LinguaLanguageBlockSentenceSplitter {
43 pub fn new() -> Self {
45 Default::default()
46 }
47
48 pub fn new_with_builder(mut builder: LanguageDetectorBuilder) -> Self {
50 Self {
51 language_detector: builder.build(),
52 }
53 }
54}
55
56impl Segmenter for LinguaLanguageBlockSentenceSplitter {
57 type SubdivisionIter<'a> = IntoIter<SegmentedToken<'a>>;
58
59 fn subdivide<'a>(
60 &self,
61 token: SegmentedToken<'a>,
62 ) -> UseOrSubdivide<SegmentedToken<'a>, IntoIter<SegmentedToken<'a>>> {
63 let languages = detect_top_n_languages(&self.language_detector, 3, token.text);
64
65 println!("Languages: {languages:?}");
66
67 if languages.is_empty() {
68 return UnicodeSentenceSplitter::new().subdivide(token);
70 }
71
72 let detection_iterator = DetectionIterator::detect(
73 &LanguageDetectorBuilder::from_languages(&languages).build(),
75 token.text,
76 );
77 let mut sentence_iterator = token.text.split_sentence_bounds().map(|s| s.len());
78
79 let mut collection: Vec<SegmentedToken<'_>> = Vec::new();
80
81 let mut last_offset = 0;
82 let mut next_sentence_split_at = sentence_iterator.next().unwrap_or(token.text.len());
83
84 for (mut start_index, end_index, language) in detection_iterator {
85 println!(
86 "{start_index}..{end_index} next split: {next_sentence_split_at}, lang: {language:?}"
87 );
88 let language = language.and_then(lingua_language_to_whatlang_language);
89 if next_sentence_split_at > start_index && next_sentence_split_at < end_index {
90 collection.push(
91 SegmentedToken::new_derived_from(
92 &token.text[start_index..next_sentence_split_at],
93 &token,
94 )
95 .with_detected_language(language, true, 1.0),
96 );
97 collection.push(
98 SegmentedToken::new_end_of_sentence(
99 &token.text[next_sentence_split_at..next_sentence_split_at],
100 )
101 .with_detected_language(language, true, 1.0),
102 );
103
104 start_index = next_sentence_split_at;
105 next_sentence_split_at = sentence_iterator
106 .next()
107 .map(|n| n + next_sentence_split_at)
108 .unwrap_or_else(|| token.text.len());
109 }
110 collection.push(
111 SegmentedToken::new_derived_from(&token.text[start_index..end_index], &token)
112 .with_detected_language(language, true, 1.0),
113 );
114
115 if next_sentence_split_at == end_index {
116 collection.push(
117 SegmentedToken::new_end_of_sentence(
118 &token.text[next_sentence_split_at..next_sentence_split_at],
119 )
120 .with_detected_language(language, true, 1.0),
121 );
122 next_sentence_split_at = sentence_iterator
123 .next()
124 .map(|n| n + next_sentence_split_at)
125 .unwrap_or_else(|| token.text.len());
126 }
127
128 last_offset = end_index;
129 }
130
131 if last_offset != token.text.len() {
132 collection.push(SegmentedToken::new_derived_from(
133 &token.text[last_offset..],
134 &token,
135 ));
136 }
137
138 UseOrSubdivide::Subdivide(collection.into_iter())
139 }
140}
141
#[cfg(test)]
mod test {

    use whatlang::Lang;

    use std::time::Instant;

    use super::*;

    use crate::chain::ChainSegmenter;
    use crate::chain::StartSegmentationChain;

    /// A text made of a French, a German and an English sentence must come
    /// out as three language-tagged sentences, each followed by an empty
    /// end-of-sentence token. Repeated 100 times with the same splitter to
    /// check that the result is stable.
    #[test]
    fn test_lingua_multilanguage_detection() {
        let splitter = LinguaLanguageBlockSentenceSplitter::new();

        let input = "Parlez-vous français? \
            Ich spreche Französisch nur ein bisschen. \
            A little bit is better than nothing. ";

        let expected: Vec<(&str, Option<Lang>)> = vec![
            ("Parlez-vous français? ", Some(Lang::Fra)),
            ("", Some(Lang::Fra)),
            (
                "Ich spreche Französisch nur ein bisschen. ",
                Some(Lang::Deu),
            ),
            ("", Some(Lang::Deu)),
            ("A little bit is better than nothing. ", Some(Lang::Eng)),
            ("", Some(Lang::Eng)),
        ];

        for _ in 0..100 {
            let actual: Vec<(&str, Option<Lang>)> = input
                .start_segmentation_chain()
                .chain_segmenter(&splitter)
                .map(|segment| (segment.text, segment.detected_language))
                .collect();

            assert_eq!(actual, expected);
        }
    }

    /// Constructing a fresh splitter for every run must, on average, be
    /// cheaper after the very first run than during it.
    #[test]
    fn test_lingua_performance() {
        let input = "Parlez-vous français? \
            Ich spreche Französisch nur ein bisschen. \
            A little bit is better than nothing.";

        // Builds a new splitter and segments `input` with it.
        let segment_with_fresh_splitter = || {
            let segments: Vec<(&str, Option<Lang>)> = input
                .start_segmentation_chain()
                .chain_segmenter(&LinguaLanguageBlockSentenceSplitter::new())
                .map(|segment| (segment.text, segment.detected_language))
                .collect();
            segments
        };

        let first_run_started = Instant::now();
        let _first_result = segment_with_fresh_splitter();
        let time_first_iteration = first_run_started.elapsed();

        let repeated_runs_started = Instant::now();
        for _ in 0..100 {
            let _result = segment_with_fresh_splitter();
        }
        let time_multiple_iterations = repeated_runs_started.elapsed();

        let average_repeated_iteration = time_multiple_iterations / 100;

        assert!(
            time_first_iteration > average_repeated_iteration,
            "Subsequent iterations should be faster than the initial one, even when not keeping the struct around. {time_first_iteration:?} > {:?}",
            average_repeated_iteration
        );
    }
}