// unobtanium_segmenter/segmentation/lingua_with_sentence_splitter.rs
// SPDX-FileCopyrightText: 2026 Slatian
//
// SPDX-License-Identifier: LGPL-3.0-only

5use std::vec::IntoIter;
6
7use lingua::LanguageDetector;
8use lingua::LanguageDetectorBuilder;
9use unicode_segmentation::UnicodeSegmentation;
10
11use crate::SegmentedToken;
12use crate::UseOrSubdivide;
13use crate::language_detection::DetectionIterator;
14use crate::language_detection::detect_top_n_languages;
15use crate::language_detection::lingua_language_to_whatlang_language;
16use crate::segmentation::Segmenter;
17use crate::segmentation::UnicodeSentenceSplitter;
18
19/// Combination of [LinguaLanguageBlockSplitter][crate::segmentation::LiguaLanguageBlockSplitter] and the [UnicodeSentenceSplitter].
20///
21/// This is intended to be used near/at the start of the segmentation chain.
22///
23/// In case no language is detected it falls back to only splitting sentences.
24///
25/// Performance: There is a significant spin-up cost involved when using this struct for the first time that won't happen on subsequent uses. This only happens once per program, independent of you keeping a specific instance or not. See the [lingua] documentation for RAM requirements.
26///
27/// Using low accuracy mode probably isn't worth it.
28///
29/// Language support: Currently this crate uses [whatlang::Lang] to communicate languages, so the language support is the intersection of whatlang and what [lingua::Language] both support.
30pub struct LinguaLanguageBlockSentenceSplitter {
31	language_detector: LanguageDetector,
32}
33
34impl Default for LinguaLanguageBlockSentenceSplitter {
35	fn default() -> Self {
36		Self {
37			language_detector: LanguageDetectorBuilder::from_all_languages()
38				.with_preloaded_language_models()
39				.build(),
40		}
41	}
42}
43
44impl LinguaLanguageBlockSentenceSplitter {
45	/// Create a new LiguaLanguageBlockSplitter instance that is configured to preload all languages on the first use.
46	pub fn new() -> Self {
47		Default::default()
48	}
49
50	/// Create a new LiguaLanguageBlockSplitter from a custom [LanguageDetectorBuilder].
51	pub fn new_with_builder(mut builder: LanguageDetectorBuilder) -> Self {
52		Self {
53			language_detector: builder.build(),
54		}
55	}
56}
57
impl Segmenter for LinguaLanguageBlockSentenceSplitter {
	type SubdivisionIter<'a> = IntoIter<SegmentedToken<'a>>;

	/// Subdivides `token` into language blocks that are further cut at
	/// Unicode sentence boundaries.
	///
	/// A first detection pass picks up to three candidate languages for the
	/// whole token; if none are found the token is handed to a plain
	/// [UnicodeSentenceSplitter]. Otherwise a second pass, restricted to
	/// those candidates, assigns a language to each span of the text. Every
	/// emitted sentence token is followed by an empty end-of-sentence marker
	/// token carrying the same language.
	fn subdivide<'a>(
		&self,
		token: SegmentedToken<'a>,
	) -> UseOrSubdivide<SegmentedToken<'a>, IntoIter<SegmentedToken<'a>>> {
		// First pass: which languages are plausibly present in this token?
		let languages = detect_top_n_languages(&self.language_detector, 3, token.text);

		if languages.is_empty() {
			// No language found: Only do sentence splitting
			return UnicodeSentenceSplitter::new().subdivide(token);
		}

		// Second pass: per-span detection, narrowed to the candidates above.
		let detection_iterator = DetectionIterator::detect(
			// This will panic for any empty languages vec, keep the check above intact
			&LanguageDetectorBuilder::from_languages(&languages).build(),
			token.text,
		);
		// Byte lengths of consecutive sentences; accumulated below into
		// absolute split offsets into `token.text`.
		let mut sentence_iterator = token.text.split_sentence_bounds().map(|s| s.len());

		// Output segments
		let mut collection: Vec<SegmentedToken<'_>> = Vec::new();

		// End offset (in bytes) of the last detection span processed so far.
		let mut last_offset = 0;
		// Absolute byte offset of the next sentence boundary; falls back to
		// the end of the text when there is no sentence boundary at all.
		let mut next_sentence_split_at = sentence_iterator.next().unwrap_or(token.text.len());

		// NOTE(review): spans are assumed to be contiguous from offset 0 —
		// text between last_offset and start_index would be dropped
		// otherwise; confirm against DetectionIterator's contract.
		for (mut start_index, end_index, language) in detection_iterator {
			// Translate lingua's language to the whatlang language this
			// crate communicates with; None when there is no equivalent.
			let language = language.and_then(lingua_language_to_whatlang_language);
			// Emit a token (plus an end-of-sentence marker) for every
			// sentence boundary that falls inside this language span.
			while next_sentence_split_at > start_index && next_sentence_split_at <= end_index {
				collection.push(
					SegmentedToken::new_derived_from(
						&token.text[start_index..next_sentence_split_at],
						&token,
					)
					.with_detected_language(language, true, 1.0),
				);
				// Empty (zero-width) marker token flagged as end-of-sentence.
				collection.push(
					SegmentedToken::new_end_of_sentence(
						&token.text[next_sentence_split_at..next_sentence_split_at],
					)
					.with_detected_language(language, true, 1.0),
				);

				start_index = next_sentence_split_at;
				// Advance to the next absolute boundary; the iterator yields
				// lengths, hence the addition to the previous offset.
				next_sentence_split_at = sentence_iterator
					.next()
					.map(|n| n + next_sentence_split_at)
					.unwrap_or_else(|| token.text.len());
			}
			// Remainder of the language span after its last sentence boundary.
			if start_index != end_index {
				collection.push(
					SegmentedToken::new_derived_from(&token.text[start_index..end_index], &token)
						.with_detected_language(language, true, 1.0),
				);
			}

			last_offset = end_index;
		}

		// Trailing text not covered by any detection span is emitted without
		// a language tag (and without sentence splitting).
		if last_offset != token.text.len() {
			collection.push(SegmentedToken::new_derived_from(
				&token.text[last_offset..],
				&token,
			));
		}

		UseOrSubdivide::Subdivide(collection.into_iter())
	}
}
134
#[cfg(test)]
mod test {

	use whatlang::Lang;

	use std::time::Instant;

	use super::*;

	use crate::chain::ChainSegmenter;
	use crate::chain::StartSegmentationChain;

	// Mixed French/German/English text must be split into one block per
	// language, each followed by an empty end-of-sentence marker token.
	#[test]
	fn test_lingua_multilanguage_detection() {
		let test_text = "Parlez-vous français? \
			Ich spreche Französisch nur ein bisschen. \
			A little bit is better than nothing.   ";

		let lingua_segmenter = LinguaLanguageBlockSentenceSplitter::new();

		let result: Vec<(&str, Option<Lang>, bool)> = test_text
			.start_segmentation_chain()
			.chain_segmenter(&lingua_segmenter)
			.map(|t| (t.text, t.detected_language, t.is_end_of_sentence))
			.collect();

		let expected_tokens = vec![
			("Parlez-vous français? ", Some(Lang::Fra), false),
			("", Some(Lang::Fra), true),
			(
				"Ich spreche Französisch nur ein bisschen. ",
				Some(Lang::Deu),
				false,
			),
			("", Some(Lang::Deu), true),
			(
				"A little bit is better than nothing.   ",
				Some(Lang::Eng),
				false,
			),
			("", Some(Lang::Eng), true),
		];

		assert_eq!(result, expected_tokens);
	}

	// Single-language text must still be split at every sentence boundary,
	// with trailing whitespace kept on the preceding sentence token.
	#[test]
	fn test_lingua_sentence_splitting() {
		let test_text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur eu tincidunt enim. Praesent ornare pharetra ligula, sit amet sodales elit gravida ut. In hac habitasse platea dictumst. Aliquam quam odio, tristique at venenatis ut, auctor sed nibh. Sed et sapien lobortis, sagittis lacus sed, posuere tellus. Donec tincidunt dictum tempor. Aenean nec nisl commodo, venenatis leo nec, cursus tellus. Mauris finibus facilisis ultrices. Quisque quis lobortis odio. Cras at nisi augue.";

		let lingua_segmenter = LinguaLanguageBlockSentenceSplitter::new();

		let result: Vec<(&str, bool)> = test_text
			.start_segmentation_chain()
			.chain_segmenter(&lingua_segmenter)
			.map(|t| (t.text, t.is_end_of_sentence))
			.collect();

		let expected_tokens = vec![
			(
				"Lorem ipsum dolor sit amet, consectetur adipiscing elit. ",
				false,
			),
			("", true),
			("Curabitur eu tincidunt enim. ", false),
			("", true),
			(
				"Praesent ornare pharetra ligula, sit amet sodales elit gravida ut. ",
				false,
			),
			("", true),
			("In hac habitasse platea dictumst. ", false),
			("", true),
			(
				"Aliquam quam odio, tristique at venenatis ut, auctor sed nibh. ",
				false,
			),
			("", true),
			(
				"Sed et sapien lobortis, sagittis lacus sed, posuere tellus. ",
				false,
			),
			("", true),
			("Donec tincidunt dictum tempor. ", false),
			("", true),
			(
				"Aenean nec nisl commodo, venenatis leo nec, cursus tellus. ",
				false,
			),
			("", true),
			("Mauris finibus facilisis ultrices. ", false),
			("", true),
			("Quisque quis lobortis odio. ", false),
			("", true),
			("Cras at nisi augue.", false),
			("", true),
		];

		// On mismatch, print a per-token side-by-side diff before panicking
		// so the failing position is easy to spot in the long table.
		if result != expected_tokens {
			eprintln!("Sentence splitting test failed!");
			eprintln!("\nExpected output:");
			for (n, line) in expected_tokens.iter().enumerate() {
				eprintln!("* {n}: {line:?}");
			}
			eprintln!("\nGot output:");
			for (n, line) in result.iter().enumerate() {
				let is_ok = expected_tokens.get(n) == Some(line);
				eprintln!("* {n}: {line:?} is_ok: {is_ok}");
			}

			panic!("Sentence splitting test failed: check stderr output.");
		}
	}

	// The first use pays the model spin-up cost (see the struct docs); later
	// uses — even with freshly constructed instances — must be faster on
	// average, proving the loaded models are shared program-wide.
	#[test]
	fn test_lingua_performance() {
		let test_text = "Parlez-vous français? \
    Ich spreche Französisch nur ein bisschen. \
    A little bit is better than nothing.";

		let start_instant = Instant::now();

		let _result: Vec<(&str, Option<Lang>)> = test_text
			.start_segmentation_chain()
			.chain_segmenter(&LinguaLanguageBlockSentenceSplitter::new())
			.map(|t| (t.text, t.detected_language))
			.collect();

		let time_first_iteration = start_instant.elapsed();
		let start_instant = Instant::now();

		for _ in 0..100 {
			let _result: Vec<(&str, Option<Lang>)> = test_text
				.start_segmentation_chain()
				.chain_segmenter(&LinguaLanguageBlockSentenceSplitter::new())
				.map(|t| (t.text, t.detected_language))
				.collect();
		}

		let time_multiple_iterations = start_instant.elapsed();

		assert!(
			time_first_iteration > (time_multiple_iterations / 100),
			"Subsequent iterations should be faster than the initial one, even when not keeping the struct around. {time_first_iteration:?} > {:?}",
			time_multiple_iterations / 100
		);
	}
}