unobtanium_segmenter/segmentation/lingua_with_sentence_splitter.rs

// SPDX-FileCopyrightText: 2026 Slatian
//
// SPDX-License-Identifier: LGPL-3.0-only

use std::vec::IntoIter;

use lingua::LanguageDetector;
use lingua::LanguageDetectorBuilder;
use unicode_segmentation::UnicodeSegmentation;

use crate::SegmentedToken;
use crate::UseOrSubdivide;
use crate::language_detection::DetectionIterator;
use crate::language_detection::detect_top_n_languages;
use crate::language_detection::lingua_language_to_whatlang_language;
use crate::segmentation::Segmenter;
use crate::segmentation::UnicodeSentenceSplitter;

/// Language splitter and detector.
///
/// This is intended to be used near or at the start of the segmentation chain.
///
/// Performance: There is a significant spin-up cost the first time this struct is used. The cost is paid once per program, regardless of whether you keep a specific instance around. See the [lingua] documentation for RAM requirements.
///
/// Using low accuracy mode probably isn't worth it.
///
/// Language support: Currently this crate uses [whatlang::Lang] to communicate languages, so the supported languages are the intersection of what [whatlang::Lang] and [lingua::Language] both support.
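///
/// A minimal usage sketch (hedged: the exact import paths are assumptions, the chain API mirrors the tests at the bottom of this file):
///
/// ```ignore
/// use unobtanium_segmenter::chain::{ChainSegmenter, StartSegmentationChain};
/// use unobtanium_segmenter::segmentation::LinguaLanguageBlockSentenceSplitter;
///
/// let segmenter = LinguaLanguageBlockSentenceSplitter::new();
/// // Each resulting token carries its text slice and the detected language as a whatlang::Lang.
/// for token in "Bonjour! Guten Tag.".start_segmentation_chain().chain_segmenter(&segmenter) {
///     println!("{:?} -> {:?}", token.text, token.detected_language);
/// }
/// ```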
pub struct LinguaLanguageBlockSentenceSplitter {
	language_detector: LanguageDetector,
}

impl Default for LinguaLanguageBlockSentenceSplitter {
	fn default() -> Self {
		Self {
			language_detector: LanguageDetectorBuilder::from_all_languages()
				.with_preloaded_language_models()
				.build(),
		}
	}
}

impl LinguaLanguageBlockSentenceSplitter {
	/// Create a new LinguaLanguageBlockSentenceSplitter instance that is configured to preload all languages on first use.
	pub fn new() -> Self {
		Default::default()
	}

	/// Create a new LinguaLanguageBlockSentenceSplitter from a custom [LanguageDetectorBuilder].
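	///
	/// A hedged sketch of restricting detection to a fixed set of languages (lingua's `LanguageDetectorBuilder::from_languages` is its public API; everything else mirrors this file):
	///
	/// ```ignore
	/// use lingua::{Language, LanguageDetectorBuilder};
	///
	/// let builder = LanguageDetectorBuilder::from_languages(&[Language::English, Language::German]);
	/// let segmenter = LinguaLanguageBlockSentenceSplitter::new_with_builder(builder);
	/// ```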
	pub fn new_with_builder(mut builder: LanguageDetectorBuilder) -> Self {
		Self {
			language_detector: builder.build(),
		}
	}
}

impl Segmenter for LinguaLanguageBlockSentenceSplitter {
	type SubdivisionIter<'a> = IntoIter<SegmentedToken<'a>>;

	fn subdivide<'a>(
		&self,
		token: SegmentedToken<'a>,
	) -> UseOrSubdivide<SegmentedToken<'a>, IntoIter<SegmentedToken<'a>>> {
		let languages = detect_top_n_languages(&self.language_detector, 3, token.text);

		println!("Languages: {languages:?}");

		if languages.is_empty() {
			// No language found: only do sentence splitting
			return UnicodeSentenceSplitter::new().subdivide(token);
		}

		let detection_iterator = DetectionIterator::detect(
			// This will panic for any empty languages vec, keep the check above intact
			&LanguageDetectorBuilder::from_languages(&languages).build(),
			token.text,
		);
		// Sentence lengths from the Unicode sentence splitter; they are summed up below to form absolute split offsets.
		let mut sentence_iterator = token.text.split_sentence_bounds().map(|s| s.len());

		let mut collection: Vec<SegmentedToken<'_>> = Vec::new();

		let mut last_offset = 0;
		let mut next_sentence_split_at = sentence_iterator.next().unwrap_or(token.text.len());

		for (mut start_index, end_index, language) in detection_iterator {
			println!(
				"{start_index}..{end_index} next split: {next_sentence_split_at}, lang: {language:?}"
			);
			let language = language.and_then(lingua_language_to_whatlang_language);
			// A sentence boundary falls inside this language block: emit the text before the boundary plus an end-of-sentence marker first.
			if next_sentence_split_at > start_index && next_sentence_split_at < end_index {
				collection.push(
					SegmentedToken::new_derived_from(
						&token.text[start_index..next_sentence_split_at],
						&token,
					)
					.with_detected_language(language, true, 1.0),
				);
				collection.push(
					SegmentedToken::new_end_of_sentence(
						&token.text[next_sentence_split_at..next_sentence_split_at],
					)
					.with_detected_language(language, true, 1.0),
				);

				start_index = next_sentence_split_at;
				next_sentence_split_at = sentence_iterator
					.next()
					.map(|n| n + next_sentence_split_at)
					.unwrap_or_else(|| token.text.len());
			}
			collection.push(
				SegmentedToken::new_derived_from(&token.text[start_index..end_index], &token)
					.with_detected_language(language, true, 1.0),
			);

			// The language block ends exactly on a sentence boundary: emit an end-of-sentence marker and advance to the next boundary.
			if next_sentence_split_at == end_index {
				collection.push(
					SegmentedToken::new_end_of_sentence(
						&token.text[next_sentence_split_at..next_sentence_split_at],
					)
					.with_detected_language(language, true, 1.0),
				);
				next_sentence_split_at = sentence_iterator
					.next()
					.map(|n| n + next_sentence_split_at)
					.unwrap_or_else(|| token.text.len());
			}

			last_offset = end_index;
		}

		// Emit any trailing text that the language detection did not cover.
		if last_offset != token.text.len() {
			collection.push(SegmentedToken::new_derived_from(
				&token.text[last_offset..],
				&token,
			));
		}

		UseOrSubdivide::Subdivide(collection.into_iter())
	}
}

#[cfg(test)]
mod test {

	use whatlang::Lang;

	use std::time::Instant;

	use super::*;

	use crate::chain::ChainSegmenter;
	use crate::chain::StartSegmentationChain;

	#[test]
	fn test_lingua_multilanguage_detection() {
		let test_text = "Parlez-vous français? \
			Ich spreche Französisch nur ein bisschen. \
			A little bit is better than nothing.   ";

		let lingua_segmenter = LinguaLanguageBlockSentenceSplitter::new();

		for _ in 0..100 {
			let result: Vec<(&str, Option<Lang>)> = test_text
				.start_segmentation_chain()
				.chain_segmenter(&lingua_segmenter)
				.map(|t| (t.text, t.detected_language))
				.collect();

			// The empty text tokens are the end-of-sentence markers emitted by the splitter.
			let expected_tokens = vec![
				("Parlez-vous français? ", Some(Lang::Fra)),
				("", Some(Lang::Fra)),
				(
					"Ich spreche Französisch nur ein bisschen. ",
					Some(Lang::Deu),
				),
				("", Some(Lang::Deu)),
				("A little bit is better than nothing.   ", Some(Lang::Eng)),
				("", Some(Lang::Eng)),
			];

			assert_eq!(result, expected_tokens);
		}
	}

	#[test]
	fn test_lingua_performance() {
		let test_text = "Parlez-vous français? \
    Ich spreche Französisch nur ein bisschen. \
    A little bit is better than nothing.";

		let start_instant = Instant::now();

		let _result: Vec<(&str, Option<Lang>)> = test_text
			.start_segmentation_chain()
			.chain_segmenter(&LinguaLanguageBlockSentenceSplitter::new())
			.map(|t| (t.text, t.detected_language))
			.collect();

		let time_first_iteration = start_instant.elapsed();
		let start_instant = Instant::now();

		for _ in 0..100 {
			let _result: Vec<(&str, Option<Lang>)> = test_text
				.start_segmentation_chain()
				.chain_segmenter(&LinguaLanguageBlockSentenceSplitter::new())
				.map(|t| (t.text, t.detected_language))
				.collect();
		}

		let time_multiple_iterations = start_instant.elapsed();

		assert!(
			time_first_iteration > (time_multiple_iterations / 100),
			"Subsequent iterations should be faster than the initial one, even when not keeping the struct around. {time_first_iteration:?} > {:?}",
			time_multiple_iterations / 100
		);
	}
}