unobtanium-segmenter 0.5.2

A text segmentation toolbox for search applications inspired by charabia and tantivy.
Documentation
// SPDX-FileCopyrightText: 2026 Slatian
//
// SPDX-License-Identifier: LGPL-3.0-only

use unicode_properties::{GeneralCategoryGroup, UnicodeGeneralCategory};

use crate::SegmentedToken;
use crate::SegmentedTokenKind;
use crate::parser_iterator::ParserIterator;

/// A segmenting iterator meant for use near the start of a segmentation chain.
///
/// It walks over the given text and yields one [`SegmentedToken`] per run of
/// characters of the same class: alphanumeric, separator or symbol.
///
/// Note: It is no longer used for its initial purpose and is now part of the NaiveWordSplitter.
///
/// One should give it roughly a paragraph of text.
pub struct InitialParagraphSplitter<'a> {
	// The text being segmented; the text of every yielded token is a slice of it.
	text: &'a str,
	// Cursor over the same text, used to peek at and consume characters.
	iterator: ParserIterator<'a>,
}

impl<'a> InitialParagraphSplitter<'a> {
	/// Build a splitter over `text`.
	///
	/// The text is kept so tokens can be sliced out of it later, and a fresh
	/// `ParserIterator` over the same text is set up for walking its characters.
	pub fn new(text: &'a str) -> Self {
		let iterator = ParserIterator::new(text);
		Self { text, iterator }
	}
}

impl<'a> Iterator for InitialParagraphSplitter<'a> {
	type Item = SegmentedToken<'a>;

	fn next(&mut self) -> Option<Self::Item> {
		let initial_character_category_group = self.iterator.peek()?.general_category_group();
		let start_index = self.iterator.index();
		let token_kind;
		match initial_character_category_group {
			GeneralCategoryGroup::Letter
			| GeneralCategoryGroup::Number
			| GeneralCategoryGroup::Mark => {
				token_kind = SegmentedTokenKind::AlphaNumeric;
				self.iterator.consume_chars(|c| {
					matches!(
						c.general_category_group(),
						GeneralCategoryGroup::Letter
							| GeneralCategoryGroup::Number
							| GeneralCategoryGroup::Mark
					)
				});
				// TODO: Match on things like number formatting and
				// maybe underscore connected words for tokenizing code
			}
			GeneralCategoryGroup::Punctuation
			| GeneralCategoryGroup::Separator
			| GeneralCategoryGroup::Other => {
				// TODO:
				// Seperate out things like private-use characters, etc …
				token_kind = SegmentedTokenKind::Separator;
				self.iterator.consume_chars(|c| {
					matches!(
						c.general_category_group(),
						GeneralCategoryGroup::Punctuation
							| GeneralCategoryGroup::Separator
							| GeneralCategoryGroup::Other
					)
				});
			}
			GeneralCategoryGroup::Symbol => {
				token_kind = SegmentedTokenKind::Symbol;
				self.iterator.consume_chars(|c| {
					matches!(c.general_category_group(), GeneralCategoryGroup::Symbol)
						|| c == '\u{200d}'
				});
			}
		}
		let end_index = self.iterator.index();
		if start_index == end_index {
			return None;
		}
		return Some(SegmentedToken::new(
			self.text.get(start_index..end_index)?,
			Some(token_kind),
		));
	}
}

#[cfg(test)]
mod test {

	use super::*;

	/// Run the splitter over `input` and collect the text of every token.
	fn token_texts(input: &str) -> Vec<&str> {
		InitialParagraphSplitter::new(input)
			.map(|token| token.text)
			.collect()
	}

	#[test]
	fn test_initial_segmentation() {
		// Words, whitespace, punctuation and emoji each end up in their own token.
		let expected_sentence = vec![
			"The", " ", "quick", " ", "brown", " ", "🦊", "fox", " ", "jumps", " ", "over",
			" ", "the", " ", "lazy", " ", "🐶", "dog", ".",
		];
		assert_eq!(
			token_texts("The quick brown 🦊fox jumps over the lazy 🐶dog."),
			expected_sentence
		);

		// An emoji sequence joined with U+200D stays together as one token.
		let expected_emoji = vec!["👪", " ", "👩‍👩‍👧"];
		assert_eq!(token_texts("👪 👩‍👩‍👧"), expected_emoji);
	}
}