Skip to main content

unobtanium_segmenter/segmentation/
lingua_with_sentence_splitter.rs

1// SPDX-FileCopyrightText: 2026 Slatian
2//
3// SPDX-License-Identifier: LGPL-3.0-only
4
5use std::vec::IntoIter;
6
7use lingua::LanguageDetector;
8use lingua::LanguageDetectorBuilder;
9use unicode_segmentation::UnicodeSegmentation;
10
11use crate::SegmentedToken;
12use crate::UseOrSubdivide;
13use crate::language_detection::DetectionIterator;
14use crate::language_detection::detect_top_n_languages;
15use crate::language_detection::lingua_language_to_whatlang_language;
16use crate::segmentation::Segmenter;
17use crate::segmentation::UnicodeSentenceSplitter;
18
19/// Combination of [LinguaLanguageBlockSplitter][crate::segmentation::LiguaLanguageBlockSplitter] and the [UnicodeSentenceSplitter].
20///
21/// This is intended to be used near/at the start of the segmentation chain.
22///
23/// In case no language is detected it falls back to only splitting sentences.
24///
25/// Performance: There is a significant spin-up cost involved when using this struct for the first time that won't happen on subsequent uses. This only happens once per program, independent of you keeping a specific instance or not. See the [lingua] documentation for RAM requirements.
26///
27/// Using low accuracy mode probably isn't worth it.
28///
29/// Language support: Currently this crate uses [whatlang::Lang] to communicate languages, so the language support is the intersection of whatlang and what [lingua::Language] both support.
30pub struct LinguaLanguageBlockSentenceSplitter {
31	language_detector: LanguageDetector,
32}
33
34impl Default for LinguaLanguageBlockSentenceSplitter {
35	fn default() -> Self {
36		Self {
37			language_detector: LanguageDetectorBuilder::from_all_languages()
38				.with_preloaded_language_models()
39				.build(),
40		}
41	}
42}
43
44impl LinguaLanguageBlockSentenceSplitter {
45	/// Create a new LiguaLanguageBlockSplitter instance that is configured to preload all languages on the first use.
46	pub fn new() -> Self {
47		Default::default()
48	}
49
50	/// Create a new LiguaLanguageBlockSplitter from a custom [LanguageDetectorBuilder].
51	pub fn new_with_builder(mut builder: LanguageDetectorBuilder) -> Self {
52		Self {
53			language_detector: builder.build(),
54		}
55	}
56}
57
58impl Segmenter for LinguaLanguageBlockSentenceSplitter {
59	type SubdivisionIter<'a> = IntoIter<SegmentedToken<'a>>;
60
61	fn subdivide<'a>(
62		&self,
63		token: SegmentedToken<'a>,
64	) -> UseOrSubdivide<SegmentedToken<'a>, IntoIter<SegmentedToken<'a>>> {
65		let languages = detect_top_n_languages(&self.language_detector, 3, token.text);
66
67		//println!("Languages: {languages:?}");
68
69		if languages.is_empty() {
70			// No lnaguage found: Only do sentence splitting
71			return UnicodeSentenceSplitter::new().subdivide(token);
72		}
73
74		let detection_iterator = DetectionIterator::detect(
75			// This will panic for any empty languages vec, keep the check above intact
76			&LanguageDetectorBuilder::from_languages(&languages).build(),
77			token.text,
78		);
79		let mut sentence_iterator = token.text.split_sentence_bounds().map(|s| s.len());
80
81		let mut collection: Vec<SegmentedToken<'_>> = Vec::new();
82
83		let mut last_offset = 0;
84		let mut next_sentence_split_at = sentence_iterator.next().unwrap_or(token.text.len());
85
86		for (mut start_index, end_index, language) in detection_iterator {
87			/*println!(
88				"{start_index}..{end_index} next split: {next_sentence_split_at}, lang: {language:?}"
89			);*/
90			let language = language.and_then(lingua_language_to_whatlang_language);
91			if next_sentence_split_at > start_index && next_sentence_split_at < end_index {
92				collection.push(
93					SegmentedToken::new_derived_from(
94						&token.text[start_index..next_sentence_split_at],
95						&token,
96					)
97					.with_detected_language(language, true, 1.0),
98				);
99				collection.push(
100					SegmentedToken::new_end_of_sentence(
101						&token.text[next_sentence_split_at..next_sentence_split_at],
102					)
103					.with_detected_language(language, true, 1.0),
104				);
105
106				start_index = next_sentence_split_at;
107				next_sentence_split_at = sentence_iterator
108					.next()
109					.map(|n| n + next_sentence_split_at)
110					.unwrap_or_else(|| token.text.len());
111			}
112			collection.push(
113				SegmentedToken::new_derived_from(&token.text[start_index..end_index], &token)
114					.with_detected_language(language, true, 1.0),
115			);
116
117			if next_sentence_split_at == end_index {
118				collection.push(
119					SegmentedToken::new_end_of_sentence(
120						&token.text[next_sentence_split_at..next_sentence_split_at],
121					)
122					.with_detected_language(language, true, 1.0),
123				);
124				next_sentence_split_at = sentence_iterator
125					.next()
126					.map(|n| n + next_sentence_split_at)
127					.unwrap_or_else(|| token.text.len());
128			}
129
130			last_offset = end_index;
131		}
132
133		if last_offset != token.text.len() {
134			collection.push(SegmentedToken::new_derived_from(
135				&token.text[last_offset..],
136				&token,
137			));
138		}
139
140		UseOrSubdivide::Subdivide(collection.into_iter())
141	}
142}
143
#[cfg(test)]
mod test {

	use std::time::Instant;

	use whatlang::Lang;

	use super::*;

	use crate::chain::ChainSegmenter;
	use crate::chain::StartSegmentationChain;

	/// Three sentences in three languages must come back as three language-tagged
	/// text tokens, each followed by an empty token carrying the same language.
	#[test]
	fn test_lingua_multilanguage_detection() {
		let test_text = "Parlez-vous français? \
			Ich spreche Französisch nur ein bisschen. \
			A little bit is better than nothing.   ";

		// The expectation does not change between runs, so it is built once.
		let expected_tokens = vec![
			("Parlez-vous français? ", Some(Lang::Fra)),
			("", Some(Lang::Fra)),
			(
				"Ich spreche Französisch nur ein bisschen. ",
				Some(Lang::Deu),
			),
			("", Some(Lang::Deu)),
			("A little bit is better than nothing.   ", Some(Lang::Eng)),
			("", Some(Lang::Eng)),
		];

		let lingua_segmenter = LinguaLanguageBlockSentenceSplitter::new();

		// Run the same instance repeatedly; every run has to give the same result.
		for _ in 0..100 {
			let result: Vec<(&str, Option<Lang>)> = test_text
				.start_segmentation_chain()
				.chain_segmenter(&lingua_segmenter)
				.map(|t| (t.text, t.detected_language))
				.collect();

			assert_eq!(result, expected_tokens);
		}
	}

	/// The first segmentation is timed against the average of later runs that
	/// each construct a fresh splitter; the first one is expected to be slower.
	#[test]
	fn test_lingua_performance() {
		/// Number of runs measured after the initial one.
		const REPEATS: u32 = 100;

		let test_text = "Parlez-vous français? \
			Ich spreche Französisch nur ein bisschen. \
			A little bit is better than nothing.";

		// Initial run with a freshly created splitter.
		let first_run_started = Instant::now();

		let _result: Vec<(&str, Option<Lang>)> = test_text
			.start_segmentation_chain()
			.chain_segmenter(&LinguaLanguageBlockSentenceSplitter::new())
			.map(|t| (t.text, t.detected_language))
			.collect();

		let time_first_iteration = first_run_started.elapsed();

		// Later runs, each one again with a new splitter instance.
		let repeats_started = Instant::now();

		for _ in 0..REPEATS {
			let _result: Vec<(&str, Option<Lang>)> = test_text
				.start_segmentation_chain()
				.chain_segmenter(&LinguaLanguageBlockSentenceSplitter::new())
				.map(|t| (t.text, t.detected_language))
				.collect();
		}

		let time_per_later_iteration = repeats_started.elapsed() / REPEATS;

		assert!(
			time_first_iteration > time_per_later_iteration,
			"Subsequent iterations should be faster than the initial one, even when not keeping the struct around. {time_first_iteration:?} > {:?}",
			time_per_later_iteration
		);
	}
}