// unobtanium-segmenter 0.5.2
//
// A text segmentation toolbox for search applications inspired by charabia and tantivy.
// Documentation
// SPDX-FileCopyrightText: 2026 Slatian
//
// SPDX-License-Identifier: LGPL-3.0-only

use std::vec::IntoIter;

use lingua::LanguageDetector;
use lingua::LanguageDetectorBuilder;
use unicode_segmentation::UnicodeSegmentation;

use crate::SegmentedToken;
use crate::UseOrSubdivide;
use crate::language_detection::DetectionIterator;
use crate::language_detection::detect_top_n_languages;
use crate::language_detection::lingua_language_to_whatlang_language;
use crate::segmentation::Segmenter;
use crate::segmentation::UnicodeSentenceSplitter;

// NOTE(review): the intra-doc link below is labelled `LinguaLanguageBlockSplitter` but its
// target path spells `LiguaLanguageBlockSplitter` — confirm which spelling matches the real type.
/// Combination of [LinguaLanguageBlockSplitter][crate::segmentation::LiguaLanguageBlockSplitter] and the [UnicodeSentenceSplitter].
///
/// This is intended to be used near/at the start of the segmentation chain.
///
/// In case no language is detected it falls back to only splitting sentences.
///
/// Performance: There is a significant spin-up cost involved when using this struct for the first time that won't happen on subsequent uses. This only happens once per program, regardless of whether you keep a specific instance around or not. See the [lingua] documentation for RAM requirements.
///
/// Using low accuracy mode probably isn't worth it.
///
/// Language support: Currently this crate uses [whatlang::Lang] to communicate languages, so the language support is the intersection of what [whatlang::Lang] and [lingua::Language] both support.
pub struct LinguaLanguageBlockSentenceSplitter {
	/// Detector that is handed to `detect_top_n_languages` to pick the candidate languages for the text of each token being subdivided.
	language_detector: LanguageDetector,
}

impl Default for LinguaLanguageBlockSentenceSplitter {
	/// Builds a splitter whose detector is created from all languages lingua offers, with the builder's preloading of language models enabled.
	fn default() -> Self {
		let language_detector = LanguageDetectorBuilder::from_all_languages()
			.with_preloaded_language_models()
			.build();

		Self { language_detector }
	}
}

impl LinguaLanguageBlockSentenceSplitter {
	/// Create a new LinguaLanguageBlockSentenceSplitter with the default configuration.
	///
	/// This is the same as [Default::default()]: the detector is built from all languages with preloading of the language models enabled.
	pub fn new() -> Self {
		Self::default()
	}

	/// Create a new LinguaLanguageBlockSentenceSplitter whose detector is built from a custom [LanguageDetectorBuilder].
	///
	/// The builder is only used to build the detector, it is not kept around.
	pub fn new_with_builder(mut builder: LanguageDetectorBuilder) -> Self {
		let language_detector = builder.build();

		Self { language_detector }
	}
}

impl Segmenter for LinguaLanguageBlockSentenceSplitter {
	type SubdivisionIter<'a> = IntoIter<SegmentedToken<'a>>;

	/// Splits the text of `token` into language blocks and additionally cuts those blocks at Unicode sentence boundaries.
	///
	/// The candidate languages for the whole text are picked using `detect_top_n_languages` (asked for 3), then a detector restricted to those candidates is used to find the language blocks. Each sentence boundary that falls inside a block produces the text up to the boundary followed by an empty end-of-sentence token, both carrying the language of the block.
	///
	/// If fewer than two candidate languages are found, the token is passed to [UnicodeSentenceSplitter] instead and this function attaches no language information.
	///
	/// Text that remains after the last language block is emitted as a single token without language information.
	fn subdivide<'a>(
		&self,
		token: SegmentedToken<'a>,
	) -> UseOrSubdivide<SegmentedToken<'a>, IntoIter<SegmentedToken<'a>>> {
		let languages = detect_top_n_languages(&self.language_detector, 3, token.text);

		// lingua documents that `LanguageDetectorBuilder::from_languages` panics when it is
		// given fewer than two languages, so a single candidate has to take the fallback
		// too, not only an empty list.
		if languages.len() < 2 {
			// Not enough languages to tell blocks apart: Only do sentence splitting
			return UnicodeSentenceSplitter::new().subdivide(token);
		}

		let detection_iterator = DetectionIterator::detect(
			// This relies on the length check above, keep it intact
			&LanguageDetectorBuilder::from_languages(&languages).build(),
			token.text,
		);
		// Yields the byte length of each sentence, not its absolute position
		let mut sentence_iterator = token.text.split_sentence_bounds().map(|s| s.len());

		// Output segments
		let mut collection: Vec<SegmentedToken<'_>> = Vec::new();

		// End of the most recently processed language block
		let mut last_offset = 0;
		// Absolute byte offset of the next sentence boundary, accumulated from the lengths above
		let mut next_sentence_split_at = sentence_iterator.next().unwrap_or(token.text.len());

		for (mut start_index, end_index, language) in detection_iterator {
			let language = language.and_then(lingua_language_to_whatlang_language);
			// Handle every sentence boundary inside (start_index, end_index]
			// NOTE(review): `true, 1.0` presumably mean "reliable" with full confidence —
			// confirm against `SegmentedToken::with_detected_language`.
			while next_sentence_split_at > start_index && next_sentence_split_at <= end_index {
				collection.push(
					SegmentedToken::new_derived_from(
						&token.text[start_index..next_sentence_split_at],
						&token,
					)
					.with_detected_language(language, true, 1.0),
				);
				// Zero-length marker located exactly at the sentence boundary
				collection.push(
					SegmentedToken::new_end_of_sentence(
						&token.text[next_sentence_split_at..next_sentence_split_at],
					)
					.with_detected_language(language, true, 1.0),
				);

				start_index = next_sentence_split_at;
				next_sentence_split_at = sentence_iterator
					.next()
					.map(|n| n + next_sentence_split_at)
					.unwrap_or_else(|| token.text.len());
			}
			// Rest of the language block that continues past the last boundary handled above
			if start_index != end_index {
				collection.push(
					SegmentedToken::new_derived_from(&token.text[start_index..end_index], &token)
						.with_detected_language(language, true, 1.0),
				);
			}

			last_offset = end_index;
		}

		// Text not covered by any language block is passed on without language information
		if last_offset != token.text.len() {
			collection.push(SegmentedToken::new_derived_from(
				&token.text[last_offset..],
				&token,
			));
		}

		UseOrSubdivide::Subdivide(collection.into_iter())
	}
}

#[cfg(test)]
mod test {

	use whatlang::Lang;

	use std::time::Instant;

	use super::*;

	use crate::chain::ChainSegmenter;
	use crate::chain::StartSegmentationChain;

	/// Three sentences in three languages are expected to come out as three language-tagged tokens, each followed by an empty end-of-sentence token.
	#[test]
	fn test_lingua_multilanguage_detection() {
		let input = "Parlez-vous français? \
			Ich spreche Französisch nur ein bisschen. \
			A little bit is better than nothing.   ";

		let expected: Vec<(&str, Option<Lang>, bool)> = vec![
			("Parlez-vous français? ", Some(Lang::Fra), false),
			("", Some(Lang::Fra), true),
			(
				"Ich spreche Französisch nur ein bisschen. ",
				Some(Lang::Deu),
				false,
			),
			("", Some(Lang::Deu), true),
			(
				"A little bit is better than nothing.   ",
				Some(Lang::Eng),
				false,
			),
			("", Some(Lang::Eng), true),
		];

		let splitter = LinguaLanguageBlockSentenceSplitter::new();

		let actual: Vec<(&str, Option<Lang>, bool)> = input
			.start_segmentation_chain()
			.chain_segmenter(&splitter)
			.map(|token| {
				(
					token.text,
					token.detected_language,
					token.is_end_of_sentence,
				)
			})
			.collect();

		assert_eq!(actual, expected);
	}

	/// A longer single-language text is expected to be cut after every sentence, with an empty end-of-sentence token following each sentence.
	#[test]
	fn test_lingua_sentence_splitting() {
		let test_text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur eu tincidunt enim. Praesent ornare pharetra ligula, sit amet sodales elit gravida ut. In hac habitasse platea dictumst. Aliquam quam odio, tristique at venenatis ut, auctor sed nibh. Sed et sapien lobortis, sagittis lacus sed, posuere tellus. Donec tincidunt dictum tempor. Aenean nec nisl commodo, venenatis leo nec, cursus tellus. Mauris finibus facilisis ultrices. Quisque quis lobortis odio. Cras at nisi augue.";

		// The sentences of the text above, in order, including their trailing whitespace
		let sentences = [
			"Lorem ipsum dolor sit amet, consectetur adipiscing elit. ",
			"Curabitur eu tincidunt enim. ",
			"Praesent ornare pharetra ligula, sit amet sodales elit gravida ut. ",
			"In hac habitasse platea dictumst. ",
			"Aliquam quam odio, tristique at venenatis ut, auctor sed nibh. ",
			"Sed et sapien lobortis, sagittis lacus sed, posuere tellus. ",
			"Donec tincidunt dictum tempor. ",
			"Aenean nec nisl commodo, venenatis leo nec, cursus tellus. ",
			"Mauris finibus facilisis ultrices. ",
			"Quisque quis lobortis odio. ",
			"Cras at nisi augue.",
		];

		// Every sentence is followed by an empty end-of-sentence token
		let expected_tokens: Vec<(&str, bool)> = sentences
			.iter()
			.flat_map(|sentence| [(*sentence, false), ("", true)])
			.collect();

		let splitter = LinguaLanguageBlockSentenceSplitter::new();

		let result: Vec<(&str, bool)> = test_text
			.start_segmentation_chain()
			.chain_segmenter(&splitter)
			.map(|token| (token.text, token.is_end_of_sentence))
			.collect();

		if result == expected_tokens {
			return;
		}

		// Print both lists line by line, that is easier to read than the assert_eq output
		eprintln!("Sentence splitting test failed!");
		eprintln!("\nExpected output:");
		for (n, line) in expected_tokens.iter().enumerate() {
			eprintln!("* {n}: {line:?}");
		}
		eprintln!("\nGot output:");
		for (n, line) in result.iter().enumerate() {
			let is_ok = expected_tokens.get(n) == Some(line);
			eprintln!("* {n}: {line:?} is_ok: {is_ok}");
		}

		panic!("Sentence splitting test failed: check stderr output.");
	}

	/// The first use of a freshly created splitter is expected to take longer than the average of subsequent uses with freshly created splitters.
	#[test]
	fn test_lingua_performance() {
		const REPEATS: u32 = 100;

		let test_text = "Parlez-vous français? \
			Ich spreche Französisch nur ein bisschen. \
			A little bit is better than nothing.";

		// First use
		let first_timer = Instant::now();

		let _first_result: Vec<(&str, Option<Lang>)> = test_text
			.start_segmentation_chain()
			.chain_segmenter(&LinguaLanguageBlockSentenceSplitter::new())
			.map(|token| (token.text, token.detected_language))
			.collect();

		let time_first_iteration = first_timer.elapsed();

		// Subsequent uses, deliberately creating a new splitter every time
		let repeat_timer = Instant::now();

		for _ in 0..REPEATS {
			let _repeated_result: Vec<(&str, Option<Lang>)> = test_text
				.start_segmentation_chain()
				.chain_segmenter(&LinguaLanguageBlockSentenceSplitter::new())
				.map(|token| (token.text, token.detected_language))
				.collect();
		}

		let average_repeat_time = repeat_timer.elapsed() / REPEATS;

		assert!(
			time_first_iteration > average_repeat_time,
			"Subsequent iterations should be faster than the initial one, even when not keeping the struct around. {time_first_iteration:?} > {:?}",
			average_repeat_time
		);
	}
}