unobtanium_segmenter/augmentation/
detect_language.rs

1use whatlang::Detector;
2
3use crate::augmentation::Augmenter;
4use crate::SegmentedToken;
5
6/// Will run language and script detection using [whatlang].
7///
8/// This is recommended to be run on whole sentences before splitting them into words.
9///
10/// It is unlikely to yield desireable results on the word level as words are usually valid in more than one language.
11#[derive(Debug, Clone, Default)]
12pub struct AugmentationDetectLanguage {
13	detector: Detector,
14}
15
16impl AugmentationDetectLanguage {
17	/// Create a new AugmentationDetectLanguage instance.
18	pub fn new() -> Self {
19		Default::default()
20	}
21
22	/// Create a new AugmentationDetectLanguage instance with a custom whatlang Detector.
23	pub fn new_with_detector(detector: Detector) -> Self {
24		Self {
25			detector
26		}
27	}
28}
29
30impl Augmenter for AugmentationDetectLanguage {
31	fn augment<'a>(&self, mut token: SegmentedToken<'a>) -> SegmentedToken<'a> {
32		if !token.is_known_word {
33			if let Some(info) = self.detector.detect(token.text) {
34				// Only overwrite the detected language if more confident
35				if token.detected_language_confidence < info.confidence() {
36					token.detected_script = Some(info.script());
37					token.detected_language = Some(info.lang());
38					token.detected_language_confidence = info.confidence();
39					token.is_detected_language_relible = info.is_reliable();
40				}
41			}
42		}
43		return token;
44	}
45}