// unobtanium-segmenter 0.5.2
//
// A text segmentation toolbox for search applications inspired by charabia and tantivy.
// Documentation
// SPDX-FileCopyrightText: 2026 Slatian
//
// SPDX-License-Identifier: LGPL-3.0-only

use std::vec::IntoIter;

use lingua::LanguageDetector;
use lingua::LanguageDetectorBuilder;

use crate::SegmentedToken;
use crate::UseOrSubdivide;
use crate::language_detection::DetectionIterator;
use crate::language_detection::detect_top_n_languages;
use crate::language_detection::lingua_language_to_whatlang_language;
use crate::segmentation::Segmenter;

/// Language splitter and detector.
///
/// This is intended to be used near/at the start of the segmentation chain.
///
/// Performance: There is a significant spin-up cost involved when using this struct for the first time that won't happen on subsequent uses. This only happens once per program, independent of whether you keep a specific instance around or not. See the [lingua] documentation for RAM requirements.
///
/// Using low accuracy mode probably isn't worth it.
///
/// Language support: Currently this crate uses [whatlang::Lang] to communicate languages, so the language support is the intersection of what whatlang and [lingua::Language] both support.
pub struct LinguaLanguageBlockSplitter {
	/// The lingua detector that is queried for the top candidate languages of each token.
	language_detector: LanguageDetector,
}

impl Default for LinguaLanguageBlockSplitter {
	/// Builds a splitter whose detector is created from all languages lingua
	/// offers, with language model preloading switched on.
	fn default() -> Self {
		let language_detector = LanguageDetectorBuilder::from_all_languages()
			.with_preloaded_language_models()
			.build();

		Self { language_detector }
	}
}

impl LinguaLanguageBlockSplitter {
	/// Create a new LinguaLanguageBlockSplitter with the default configuration.
	///
	/// This is the same as [`Default::default`]: the detector is built from all
	/// languages with language model preloading enabled.
	pub fn new() -> Self {
		Self::default()
	}

	/// Create a new LinguaLanguageBlockSplitter from a custom [LanguageDetectorBuilder].
	///
	/// The builder is consumed; the detector it builds is used as-is, so any
	/// language selection or accuracy settings made on the builder apply.
	pub fn new_with_builder(mut builder: LanguageDetectorBuilder) -> Self {
		let language_detector = builder.build();

		Self { language_detector }
	}
}

impl Segmenter for LinguaLanguageBlockSplitter {
	type SubdivisionIter<'a> = IntoIter<SegmentedToken<'a>>;

	/// Split `token` into consecutive blocks, each annotated with its detected language.
	///
	/// The text is first reduced to its (at most three) top candidate languages;
	/// a detector restricted to those candidates then splits the text into blocks.
	///
	/// Returns [`UseOrSubdivide::Use`] with the unchanged token when no candidate
	/// language was found. Otherwise returns [`UseOrSubdivide::Subdivide`] with
	/// tokens that together cover the whole input text without gaps: any stretch
	/// of text the detection skipped is emitted as a plain derived token without
	/// language information set by this segmenter.
	fn subdivide<'a>(
		&self,
		token: SegmentedToken<'a>,
	) -> UseOrSubdivide<SegmentedToken<'a>, IntoIter<SegmentedToken<'a>>> {
		let languages = detect_top_n_languages(&self.language_detector, 3, token.text);

		if languages.is_empty() {
			// No language found: Return token as is
			return UseOrSubdivide::Use(token);
		}

		let detection_iterator = DetectionIterator::detect(
			// This will panic for any empty languages vec, keep the check above intact
			&LanguageDetectorBuilder::from_languages(&languages).build(),
			token.text,
		);

		// The number of blocks is not known up front. Letting the Vec grow on demand
		// is far cheaper than running another full detection pass over the text
		// just to obtain a capacity hint.
		let mut collection: Vec<SegmentedToken<'_>> = Vec::new();

		// Byte offset into `token.text` up to which output has been emitted.
		let mut last_offset = 0;

		for (start_index, end_index, language) in detection_iterator {
			// Make sure there are no gaps in the output.
			if last_offset != start_index {
				collection.push(SegmentedToken::new_derived_from(
					&token.text[last_offset..start_index],
					&token,
				));
			}

			let mut new_token =
				SegmentedToken::new_derived_from(&token.text[start_index..end_index], &token);
			new_token.detected_language = language.and_then(lingua_language_to_whatlang_language);
			// NOTE(review): the flag is set even when `detected_language` ends up
			// being `None` — confirm that this is intended.
			new_token.is_detected_language_relible = true;

			collection.push(new_token);

			last_offset = end_index;
		}

		// Emit whatever text remains after the last detected block.
		if last_offset != token.text.len() {
			collection.push(SegmentedToken::new_derived_from(
				&token.text[last_offset..],
				&token,
			));
		}

		UseOrSubdivide::Subdivide(collection.into_iter())
	}
}

#[cfg(test)]
mod test {

	use std::time::Instant;

	use super::*;

	use crate::chain::ChainSegmenter;
	use crate::chain::StartSegmentationChain;

	use whatlang::Lang;

	/// A text made of a French, a German and an English sentence must come out
	/// as exactly three tokens with the matching languages, and the result must
	/// be the same on each of 100 runs with one shared splitter instance.
	#[test]
	fn test_lingua_multilanguage_detection() {
		let test_text = "Parlez-vous français? \
			Ich spreche Französisch nur ein bisschen. \
			A little bit is better than nothing.   ";

		let splitter = LinguaLanguageBlockSplitter::new();

		// The expectation is the same for every run, so it is built only once.
		let expected_tokens = vec![
			("Parlez-vous français? ", Some(Lang::Fra)),
			(
				"Ich spreche Französisch nur ein bisschen. ",
				Some(Lang::Deu),
			),
			("A little bit is better than nothing.   ", Some(Lang::Eng)),
		];

		for _ in 0..100 {
			let actual_tokens: Vec<(&str, Option<Lang>)> = test_text
				.start_segmentation_chain()
				.chain_segmenter(&splitter)
				.map(|token| (token.text, token.detected_language))
				.collect();

			assert_eq!(actual_tokens, expected_tokens);
		}
	}

	/// The very first use (including construction of the splitter) must take
	/// longer than the average of 100 later uses that each construct a fresh
	/// splitter.
	#[test]
	fn test_lingua_performance() {
		let test_text = "Parlez-vous français? \
			Ich spreche Französisch nur ein bisschen. \
			A little bit is better than nothing.";

		// First use: construction of the splitter is part of the measured time.
		let first_run_timer = Instant::now();

		let _first_result: Vec<(&str, Option<Lang>)> = test_text
			.start_segmentation_chain()
			.chain_segmenter(&LinguaLanguageBlockSplitter::new())
			.map(|token| (token.text, token.detected_language))
			.collect();

		let time_first_iteration = first_run_timer.elapsed();

		// Later uses: a new splitter is constructed for every single run.
		let repeated_runs_timer = Instant::now();

		(0..100).for_each(|_| {
			let _repeated_result: Vec<(&str, Option<Lang>)> = test_text
				.start_segmentation_chain()
				.chain_segmenter(&LinguaLanguageBlockSplitter::new())
				.map(|token| (token.text, token.detected_language))
				.collect();
		});

		let time_multiple_iterations = repeated_runs_timer.elapsed();
		let average_repeated_time = time_multiple_iterations / 100;

		assert!(
			time_first_iteration > average_repeated_time,
			"Subsequent iterations should be faster than the initial one, even when not keeping the struct around. {time_first_iteration:?} > {:?}",
			average_repeated_time
		);
	}
}