unobtanium-segmenter 0.5.2

A text segmentation toolbox for search applications inspired by charabia and tantivy.
Documentation
// SPDX-FileCopyrightText: 2026 Slatian
//
// SPDX-License-Identifier: LGPL-3.0-only

use whatlang::Detector;

use crate::SegmentedToken;
use crate::augmentation::Augmenter;

/// Augmenter that runs language and script detection using [whatlang].
///
/// This is recommended to be run on whole sentences before splitting them into words.
///
/// It is unlikely to yield desirable results on the word level, as words are usually
/// valid in more than one language.
///
/// Tokens marked as known words are skipped, and an existing detection result on a
/// token is only replaced when the new result has a strictly higher confidence.
#[derive(Debug, Clone, Default)]
pub struct AugmentationDetectLanguage {
	/// The [whatlang] detector that is run on each token's text.
	detector: Detector,
}

impl AugmentationDetectLanguage {
	/// Create a new AugmentationDetectLanguage instance.
	pub fn new() -> Self {
		Default::default()
	}

	/// Create a new AugmentationDetectLanguage instance with a custom whatlang Detector.
	pub fn new_with_detector(detector: Detector) -> Self {
		Self { detector }
	}
}

impl Augmenter for AugmentationDetectLanguage {
	/// Annotate `token` with the language and script detected for its text.
	///
	/// Tokens flagged as known words are returned untouched. For every other token the
	/// detector is run on `token.text`. The token's detection fields (script, language,
	/// confidence, reliability) are only overwritten when the new result's confidence is
	/// strictly higher than the confidence already stored on the token.
	///
	/// If the detector yields no result, the token is returned unchanged.
	fn augment<'a>(&self, mut token: SegmentedToken<'a>) -> SegmentedToken<'a> {
		// Known words are not re-analysed.
		if token.is_known_word {
			return token;
		}

		// No detection result: leave the token as it was.
		let Some(info) = self.detector.detect(token.text) else {
			return token;
		};

		// Only overwrite the detected language if more confident
		let confidence = info.confidence();
		if token.detected_language_confidence < confidence {
			token.detected_script = Some(info.script());
			token.detected_language = Some(info.lang());
			token.detected_language_confidence = confidence;
			// NOTE(review): the `relible` spelling follows the existing `SegmentedToken`
			// field name; renaming it would be an API change outside this block.
			token.is_detected_language_relible = info.is_reliable();
		}

		token
	}
}