unobtanium-segmenter 0.5.2

A text segmentation toolbox for search applications inspired by charabia and tantivy.
Documentation
// SPDX-FileCopyrightText: 2026 Slatian
//
// SPDX-License-Identifier: LGPL-3.0-only

use rust_stemmers::Algorithm;
use rust_stemmers::Stemmer;
use whatlang::Lang;

use crate::SegmentedToken;
use crate::SegmentedTokenKind;
use crate::augmentation::Augmenter;

/// Stems tokens using the algorithm that matches the language tagged onto them, if such an algorithm is available.
///
/// This uses the [rust_stemmers] crate under the hood.
///
/// It is recommended to run this after an [AugmentationDetectLanguage][crate::augmentation::AugmentationDetectLanguage] has been used, as it will not do anything if no language metadata is available!
///
/// If you need lowercase normalization, do that **before** this normalizer and set `process_already_normalized` to `true`. This is because some normalizers can't handle SCREAMING CASE.
///
/// Tokens will be ignored if:
/// * They are known to not be a [SegmentedTokenKind::AlphaNumeric]
/// * They already have `normalized_text` set while `process_already_normalized` is `false` (default)
///
/// If the token's [normalization_language](SegmentedToken::normalization_language) is already set to a `Some` value, that one will be used and the detected language ignored.
#[derive(Debug, Clone)]
pub struct NormalizationRustStemmers {
	/// Confidence threshold above which the detected language is used for normalization anyway, even if the language detection did not flag itself as reliable.
	/// Setting this can help with shorter texts.
	///
	/// 1.0 translates to never ignoring the flag.
	/// 0.0 would mean to always ignore it.
	///
	/// Default is 0.4 as that is usually "good enough" for correct stemming.
	pub anyway_above_confidence: f64,

	/// Whether to process tokens that are already normalized.
	/// You want to enable this if your pipeline does some generic preprocessing like lowercasing.
	///
	/// Default is `false` for backwards compatibility.
	pub process_already_normalized: bool,
}

impl NormalizationRustStemmers {
	/// Create a new NormalizationRustStemmers instance with the default settings.
	///
	/// Equivalent to calling [Default::default].
	pub fn new() -> Self {
		Self::default()
	}

	/// Builder-style setter for [anyway_above_confidence][Self::anyway_above_confidence].
	///
	/// Consumes the instance and returns it with only that field replaced.
	pub fn set_anyway_above_confidence(self, anyway_above_confidence: f64) -> Self {
		Self {
			anyway_above_confidence,
			..self
		}
	}

	/// Builder-style setter for [process_already_normalized][Self::process_already_normalized].
	///
	/// Consumes the instance and returns it with only that field replaced.
	pub fn set_process_already_normalized(self, process_already_normalized: bool) -> Self {
		Self {
			process_already_normalized,
			..self
		}
	}
}

impl Default for NormalizationRustStemmers {
	/// Default settings: a confidence threshold of 0.4 and already normalized tokens are skipped.
	fn default() -> Self {
		NormalizationRustStemmers {
			process_already_normalized: false,
			anyway_above_confidence: 0.4,
		}
	}
}

impl Augmenter for NormalizationRustStemmers {
	/// Stem the token's text if a suitable language and stemming algorithm can be determined.
	///
	/// The token is returned unchanged when:
	/// * its kind is known and is not [SegmentedTokenKind::AlphaNumeric],
	/// * it was already normalized and `process_already_normalized` is `false`,
	/// * no language could be picked, or the picked language has no stemming algorithm.
	///
	/// Language selection: a preset `normalization_language` always wins, even if
	/// no stemmer exists for it. Otherwise the detected language is used, but only when
	/// the detection is flagged reliable or its confidence exceeds
	/// `anyway_above_confidence`.
	fn augment<'a>(&self, mut token: SegmentedToken<'a>) -> SegmentedToken<'a> {
		let kind_may_be_stemmed = matches!(
			token.kind,
			None | Some(SegmentedTokenKind::AlphaNumeric)
		);
		if !kind_may_be_stemmed || (token.was_normalized() && !self.process_already_normalized)
		{
			return token;
		}

		let detection_is_trusted = token.is_detected_language_relible
			|| token.detected_language_confidence > self.anyway_above_confidence;

		// A preset language is never overridden by the detected one.
		let language = match token.normalization_language {
			None if detection_is_trusted => token.detected_language,
			preset => preset,
		};

		// Pair the language with its algorithm so both are available for the update below.
		let selection = language.and_then(|lang| {
			get_stemming_algorithm_for_lang(lang).map(|algorithm| (lang, algorithm))
		});

		if let Some((lang, algorithm)) = selection {
			// Build the owned stem first so the shared borrow of `token` ends before it is mutated.
			let stemmed = Stemmer::create(algorithm)
				.stem(token.get_text_prefer_normalized())
				.to_string();
			token.update_normalized_string(stemmed, Some(lang));
		}

		token
	}
}

/// Look up the [rust_stemmers] algorithm that corresponds to a whatlang language.
///
/// Returns `None` for every language that has no mapping listed below.
fn get_stemming_algorithm_for_lang(lang: Lang) -> Option<Algorithm> {
	match lang {
		Lang::Ara => Some(Algorithm::Arabic),
		Lang::Dan => Some(Algorithm::Danish),
		Lang::Nld => Some(Algorithm::Dutch),
		Lang::Eng => Some(Algorithm::English),
		Lang::Fin => Some(Algorithm::Finnish),
		Lang::Fra => Some(Algorithm::French),
		Lang::Deu => Some(Algorithm::German),
		Lang::Ell => Some(Algorithm::Greek),
		Lang::Hun => Some(Algorithm::Hungarian),
		Lang::Ita => Some(Algorithm::Italian),
		// Missing: Norwegian, whatlang can't detect it
		// NOTE(review): whatlang appears to expose `Lang::Nob` (Norwegian Bokmål) — confirm
		// whether it should be mapped to `Algorithm::Norwegian`.
		Lang::Por => Some(Algorithm::Portuguese),
		Lang::Ron => Some(Algorithm::Romanian),
		Lang::Rus => Some(Algorithm::Russian),
		Lang::Spa => Some(Algorithm::Spanish),
		Lang::Swe => Some(Algorithm::Swedish),
		Lang::Tam => Some(Algorithm::Tamil),
		Lang::Tur => Some(Algorithm::Turkish),
		_ => None,
	}
}

#[cfg(test)]
mod test {

	use super::*;

	use crate::chain::*;

	use crate::augmentation::AugmentationDetectLanguage;
	use crate::normalization::NormalizationLowercase;
	use crate::segmentation::UnicodeSentenceSplitter;
	use crate::segmentation::UnicodeWordSplitter;

	/// Turn a list of string literals into the owned form the pipelines collect into.
	fn owned_tokens(tokens: &[&str]) -> Vec<String> {
		tokens.iter().map(|s| s.to_string()).collect()
	}

	/// Sentence split → language detection → word split → stemming.
	///
	/// The last sentence is all uppercase and is expected to pass through unstemmed.
	#[test]
	fn test_stemmed_unicode_word_split() {
		let test_text = "Fischers Fritze fischt frische Fische! The jumping brown fox quickly jumps over the sleeping dog. CAN NOT HANDLE SCREAMING CASE OUTRAGE!";

		let sentence_splitter = UnicodeSentenceSplitter::new();
		let language_detector = AugmentationDetectLanguage::new();
		let word_splitter = UnicodeWordSplitter::new();
		let stemmer = NormalizationRustStemmers::new().set_anyway_above_confidence(0.1);

		let result: Vec<String> = test_text
			.start_segmentation_chain()
			.chain_segmenter(&sentence_splitter)
			.chain_augmenter(&language_detector)
			.inspect(|x| println!("{x:?}"))
			.chain_segmenter(&word_splitter)
			.inspect(|x| println!("word: {x:?}"))
			.chain_augmenter(&stemmer)
			.map(|t| t.get_text_prefer_normalized_owned())
			.collect();

		let expected_tokens = owned_tokens(&[
			// First sentence
			"Fisch",
			" ",
			"Fritz",
			" ",
			"fischt",
			" ",
			"frisch",
			" ",
			"Fisch",
			"!",
			" ",
			"",
			// Second sentence
			"The",
			" ",
			"jump",
			" ",
			"brown",
			" ",
			"fox",
			" ",
			"quick",
			" ",
			"jump",
			" ",
			"over",
			" ",
			"the",
			" ",
			"sleep",
			" ",
			"dog",
			".",
			" ",
			"",
			// Third sentence, uppercase and left as-is
			"CAN",
			" ",
			"NOT",
			" ",
			"HANDLE",
			" ",
			"SCREAMING",
			" ",
			"CASE",
			" ",
			"OUTRAGE",
			"!",
			"",
		]);

		assert_eq!(result, expected_tokens);
	}

	/// Same pipeline with lowercasing inserted before the stemmer, which is
	/// configured to process already normalized tokens.
	#[test]
	fn test_stemmed_unicode_word_split_lowercase() {
		let test_text = "Fischers Fritze fischt frische Fische! The jumping brown fox quickly jumps over the sleeping dog. CAN ALSO HANDLE SCREAMING CASE OUTRAGE!";

		let sentence_splitter = UnicodeSentenceSplitter::new();
		let language_detector = AugmentationDetectLanguage::new();
		let word_splitter = UnicodeWordSplitter::new();
		let lowercaser = NormalizationLowercase::new();
		let stemmer = NormalizationRustStemmers::new()
			.set_anyway_above_confidence(0.1)
			.set_process_already_normalized(true);

		let result: Vec<String> = test_text
			.start_segmentation_chain()
			.chain_segmenter(&sentence_splitter)
			.chain_augmenter(&language_detector)
			.inspect(|x| println!("{x:?}"))
			.chain_segmenter(&word_splitter)
			.inspect(|x| println!("word: {x:?}"))
			.chain_augmenter(&lowercaser)
			.chain_augmenter(&stemmer)
			.map(|t| t.get_text_prefer_normalized_owned())
			.collect();

		let expected_tokens = owned_tokens(&[
			// First sentence
			"fisch",
			" ",
			"fritz",
			" ",
			"fischt",
			" ",
			"frisch",
			" ",
			"fisch",
			"!",
			" ",
			"",
			// Second sentence
			"the",
			" ",
			"jump",
			" ",
			"brown",
			" ",
			"fox",
			" ",
			"quick",
			" ",
			"jump",
			" ",
			"over",
			" ",
			"the",
			" ",
			"sleep",
			" ",
			"dog",
			".",
			" ",
			"",
			// Third sentence, lowercased before stemming
			"can",
			" ",
			"also",
			" ",
			"handl",
			" ",
			"screaming",
			" ",
			"cas",
			" ",
			"outrag",
			"!",
			"",
		]);

		assert_eq!(result, expected_tokens);
	}
}