unobtanium_segmenter/augmentation/
detect_language.rs

1use whatlang::detect;
2
3use crate::augmentation::Augmenter;
4use crate::SegmentedToken;
5
/// Augmenter that runs language and script detection using [whatlang].
///
/// Best applied to whole sentences, before they are split into words.
///
/// Results on individual words are unlikely to be desirable, as a single word is usually valid in more than one language.
#[derive(Clone, Debug, Default)]
pub struct AugmentationDetectLanguage {}
13
14impl AugmentationDetectLanguage {
15	/// Create a new AugmentationDetectLanguage instance.
16	pub fn new() -> Self {
17		Default::default()
18	}
19}
20
21impl Augmenter for AugmentationDetectLanguage {
22	fn augment<'a>(&self, mut token: SegmentedToken<'a>) -> SegmentedToken<'a> {
23		if !token.is_known_word {
24			if let Some(info) = detect(token.text) {
25				// Only overwrite the detected language if more confident
26				if token.detected_language_confidence < info.confidence() {
27					token.detected_script = Some(info.script());
28					token.detected_language = Some(info.lang());
29					token.detected_language_confidence = info.confidence();
30					token.is_detected_language_relible = info.is_reliable();
31				}
32			}
33		}
34		return token;
35	}
36}