unobtanium_segmenter/augmentation/
detect_language.rs

1// SPDX-FileCopyrightText: 2026 Slatian
2//
3// SPDX-License-Identifier: LGPL-3.0-only
4
5use whatlang::Detector;
6
7use crate::SegmentedToken;
8use crate::augmentation::Augmenter;
9
10/// Will run language and script detection using [whatlang].
11///
12/// This is recommended to be run on whole sentences before splitting them into words.
13///
14/// It is unlikely to yield desireable results on the word level as words are usually valid in more than one language.
15#[derive(Debug, Clone, Default)]
16pub struct AugmentationDetectLanguage {
17	detector: Detector,
18}
19
20impl AugmentationDetectLanguage {
21	/// Create a new AugmentationDetectLanguage instance.
22	pub fn new() -> Self {
23		Default::default()
24	}
25
26	/// Create a new AugmentationDetectLanguage instance with a custom whatlang Detector.
27	pub fn new_with_detector(detector: Detector) -> Self {
28		Self { detector }
29	}
30}
31
32impl Augmenter for AugmentationDetectLanguage {
33	#[allow(clippy::collapsible_if)]
34	fn augment<'a>(&self, mut token: SegmentedToken<'a>) -> SegmentedToken<'a> {
35		if !token.is_known_word {
36			if let Some(info) = self.detector.detect(token.text) {
37				// Only overwrite the detected language if more confident
38				if token.detected_language_confidence < info.confidence() {
39					token.detected_script = Some(info.script());
40					token.detected_language = Some(info.lang());
41					token.detected_language_confidence = info.confidence();
42					token.is_detected_language_relible = info.is_reliable();
43				}
44			}
45		}
46		return token;
47	}
48}