// unobtanium-segmenter 0.5.2
//
// A text segmentation toolbox for search applications inspired by charabia and tantivy.
// Documentation
// SPDX-FileCopyrightText: 2026 Slatian
//
// SPDX-License-Identifier: LGPL-3.0-only

use unicode_properties::GeneralCategoryGroup;
use unicode_properties::UnicodeGeneralCategory;

use crate::SegmentedToken;
use crate::SegmentedTokenKind;
use crate::augmentation::Augmenter;

/// An augmenter that rewrites the [SegmentedToken::kind] field to match reality.
///
/// It does so by reading the token text (preferring the normalized text)
/// and applying heuristics based on the unicode [GeneralCategoryGroup] of the
/// characters it contains.
///
/// The following heuristics are applied in the given order:
///
/// 1. If it contains **Letters** or **Numbers** -> [SegmentedTokenKind::AlphaNumeric]
/// 2. If it contains **Symbols** or **Other** -> [SegmentedTokenKind::Symbol]
/// 3. If it contains **Punctuation** or **Separators** -> [SegmentedTokenKind::Separator]
///
/// A single matching character is enough for a heuristic to apply, so a token
/// that mixes punctuation with at least one letter or number is alphanumeric.
///
/// Exceptions from the usual unicode classification: `\n` and `\0` count as separators.
///
/// The **Mark** category is ignored. If none of the heuristics apply the token kind is reset to `None`.
///
/// Whatever kind the token carried before is always overwritten.
/// The struct has no fields, so there is nothing to configure.
#[derive(Debug, Clone, Default)]
pub struct AugmentationClassify {}

impl AugmentationClassify {
	/// Construct a classify augmenter.
	///
	/// The type has no fields, so this yields the same value as its
	/// derived [Default] implementation.
	pub fn new() -> Self {
		Self {}
	}
}

impl Augmenter for AugmentationClassify {
	/// Overwrite `token.kind` with a classification derived from the token text.
	///
	/// The text is obtained through `get_text_prefer_normalized()` and every
	/// character is looked up by its unicode general category group.
	/// Priority, highest first: alphanumeric, symbol, separator, `None`.
	fn augment<'a>(&self, mut token: SegmentedToken<'a>) -> SegmentedToken<'a> {
		let mut found_alphanumeric = false;
		let mut found_symbol = false;
		let mut found_separator = false;

		for character in token.get_text_prefer_normalized().chars() {
			match character.general_category_group() {
				GeneralCategoryGroup::Letter | GeneralCategoryGroup::Number => {
					// Highest priority: nothing later in the text can change the outcome.
					found_alphanumeric = true;
					break;
				}
				GeneralCategoryGroup::Punctuation | GeneralCategoryGroup::Separator => {
					found_separator = true;
				}
				GeneralCategoryGroup::Symbol | GeneralCategoryGroup::Other => {
					// Newline and NUL are deliberately treated as separators
					// instead of falling into the symbol bucket.
					if matches!(character, '\n' | '\0') {
						found_separator = true;
					} else {
						found_symbol = true;
					}
				}
				GeneralCategoryGroup::Mark => {
					// Marks never influence the classification.
				}
			}
		}

		token.kind = if found_alphanumeric {
			Some(SegmentedTokenKind::AlphaNumeric)
		} else if found_symbol {
			Some(SegmentedTokenKind::Symbol)
		} else if found_separator {
			Some(SegmentedTokenKind::Separator)
		} else {
			// Empty text or marks only: no kind can be determined.
			None
		};
		token
	}
}

#[cfg(test)]
mod test {

	use super::*;

	use crate::chain::ChainAugmenter;
	use crate::chain::ChainSegmenter;
	use crate::chain::StartSegmentationChain;
	use crate::segmentation::UnicodeWordSplitter;

	/// Shorthand for the expected kind of an alphanumeric token.
	fn a() -> Option<SegmentedTokenKind> {
		Some(SegmentedTokenKind::AlphaNumeric)
	}

	/// Shorthand for the expected kind of a separator token.
	fn s() -> Option<SegmentedTokenKind> {
		Some(SegmentedTokenKind::Separator)
	}

	/// Shorthand for the expected kind of a symbol token.
	fn y() -> Option<SegmentedTokenKind> {
		Some(SegmentedTokenKind::Symbol)
	}

	/// Splits a mixed sentence with the unicode word splitter and checks the
	/// kind the classifier assigns to every resulting token.
	#[test]
	fn test_unicode_word_split() {
		let test_text = "The quick (\"brown\") fox🦊 can't jump 32.3 feet, right?\nThe quick (\"brown\")  fox. The value of π in german is '3,141592…'.";

		let word_splitter = UnicodeWordSplitter::new();
		let classifier = AugmentationClassify::new();

		let result: Vec<(&str, Option<SegmentedTokenKind>)> = test_text
			.start_segmentation_chain()
			.chain_segmenter(&word_splitter)
			.chain_augmenter(&classifier)
			.map(|t| (t.text, t.kind))
			.collect();

		let expected_tokens = vec![
			("The", a()),
			(" ", s()),
			("quick", a()),
			(" ", s()),
			("(", s()),
			("\"", s()),
			("brown", a()),
			("\"", s()),
			(")", s()),
			(" ", s()),
			("fox", a()),
			("🦊", y()),
			(" ", s()),
			("can't", a()),
			(" ", s()),
			("jump", a()),
			(" ", s()),
			("32.3", a()),
			(" ", s()),
			("feet", a()),
			(",", s()),
			(" ", s()),
			("right", a()),
			("?", s()),
			("\n", s()),
			("The", a()),
			(" ", s()),
			("quick", a()),
			(" ", s()),
			("(", s()),
			("\"", s()),
			("brown", a()),
			("\"", s()),
			(")", s()),
			("  ", s()),
			("fox", a()),
			(".", s()),
			(" ", s()),
			("The", a()),
			(" ", s()),
			("value", a()),
			(" ", s()),
			("of", a()),
			(" ", s()),
			// Must be the same greek letter as in `test_text`; it is a
			// letter and therefore alphanumeric.
			("π", a()),
			(" ", s()),
			("in", a()),
			(" ", s()),
			("german", a()),
			(" ", s()),
			("is", a()),
			(" ", s()),
			("'", s()),
			("3,141592", a()),
			("…", s()),
			("'", s()),
			(".", s()),
		];

		assert_eq!(result, expected_tokens);
	}
}