unobtanium-segmenter 0.5.2

A text segmentation toolbox for search applications inspired by charabia and tantivy.
Documentation
// SPDX-FileCopyrightText: 2026 Slatian
//
// SPDX-License-Identifier: LGPL-3.0-only

use unicode_segmentation::UnicodeSegmentation;

use std::vec::IntoIter;

use crate::SegmentedToken;
use crate::UseOrSubdivide;

use crate::segmentation::Segmenter;

/// Splits large chunks of text into sentences according to the Unicode
/// definition of a sentence.
///
/// Uses [`UnicodeSegmentation::split_sentence_bounds`] under the hood.
///
/// The type carries no configuration or state.
#[derive(Debug, Clone, Default)]
pub struct UnicodeSentenceSplitter {}

impl UnicodeSentenceSplitter {
	/// Creates a new `UnicodeSentenceSplitter`.
	///
	/// Equivalent to [`Default::default`], as the struct has no fields.
	pub fn new() -> Self {
		Self {}
	}
}

impl Segmenter for UnicodeSentenceSplitter {
	type SubdivisionIter<'a> = IntoIter<SegmentedToken<'a>>;

	/// Breaks the text of `token` apart at Unicode sentence boundaries.
	///
	/// Every sentence reported by `split_sentence_bounds` contributes two
	/// tokens, in order:
	///
	/// 1. a token derived from `token` that holds the complete sentence text
	///    (including any trailing whitespace the boundary iterator attaches), and
	/// 2. a token built with `SegmentedToken::new_end_of_sentence` from the
	///    empty slice located at the very end of that sentence.
	///
	/// Returns `UseOrSubdivide::Subdivide` with those tokens when more than one
	/// token was produced, i.e. when the text contained at least one sentence.
	/// Otherwise the original `token` is handed back via `UseOrSubdivide::Use`.
	fn subdivide<'a>(
		&self,
		token: SegmentedToken<'a>,
	) -> UseOrSubdivide<SegmentedToken<'a>, IntoIter<SegmentedToken<'a>>> {
		let pieces: Vec<SegmentedToken<'a>> = token
			.text
			.split_sentence_bounds()
			.flat_map(|sentence| {
				// Splitting at the full length keeps the whole sentence in `body`
				// and leaves `end_marker` as an empty slice at the sentence end.
				let (body, end_marker) = sentence.split_at(sentence.len());
				[
					SegmentedToken::new_derived_from(body, &token),
					SegmentedToken::new_end_of_sentence(end_marker),
				]
			})
			.collect();

		// Nothing was split off: pass the incoming token through untouched.
		if pieces.len() <= 1 {
			return UseOrSubdivide::Use(token);
		}

		UseOrSubdivide::Subdivide(pieces.into_iter())
	}
}

#[cfg(test)]
mod test {

	use super::*;

	use crate::chain::ChainSegmenter;
	use crate::chain::StartSegmentationChain;

	/// Runs a three-sentence text through the splitter and checks that each
	/// sentence comes out as its own token, followed by an empty token.
	///
	/// The input deliberately contains punctuation that must not end a
	/// sentence: a decimal point, quotes, parentheses, an apostrophe and a
	/// decimal comma.
	#[test]
	fn test_unicode_sentence_split() {
		let input = "The quick (\"brown\") fox can't jump 32.3 feet, right?\nThe quick (\"brown\")  fox. The value of π in german is '3,141592…'.";
		let splitter = UnicodeSentenceSplitter::new();

		let actual: Vec<&str> = input
			.start_segmentation_chain()
			.chain_segmenter(&splitter)
			.map(|segment| segment.text)
			.collect();

		// Each sentence is followed by the empty text of its end-of-sentence token.
		let expected = [
			"The quick (\"brown\") fox can't jump 32.3 feet, right?\n",
			"",
			"The quick (\"brown\")  fox. ",
			"",
			"The value of π in german is '3,141592…'.",
			"",
		];

		assert_eq!(actual, expected);
	}
}