unobtanium-segmenter 0.5.2

A text segmentation toolbox for search applications inspired by charabia and tantivy.
Documentation
// SPDX-FileCopyrightText: 2026 Slatian
//
// SPDX-License-Identifier: LGPL-3.0-only

#![allow(clippy::needless_return)] // Explicit returning can improve readability
#![allow(clippy::result_unit_err)] // Better than using an Option …
#![allow(clippy::to_string_trait_impl)] // Don't want to deal with reimplementing as `Display` right now.
#![allow(clippy::redundant_field_names)] // Those are okay
#![allow(clippy::tabs_in_doc_comments)] // Tabs are for indentation, period.
#![warn(missing_docs)]

//! Crate that gives you building blocks for putting together text segmentation pipelines.
//!
//! The base unit of data it works with is a [SegmentedToken]. It is based on splitting these into increasingly smaller tokens using [segmenters][segmentation], in between adding metadata using [augmenters][augmentation] and finally applying [normalization].
//!
//! All this is based on Rust's built-in Iterator framework, so whenever a "magic" method from this crate operates on multiple tokens, that could be any iterator that happens to contain [SegmentedToken] items, and whenever something is returned it can be treated like any other iterator.
//!
//! ```
//! use unobtanium_segmenter::augmentation::AugmentationClassify;
//! use unobtanium_segmenter::augmentation::AugmentationDetectLanguage;
//! use unobtanium_segmenter::chain::ChainAugmenter;
//! use unobtanium_segmenter::chain::ChainSegmenter;
//! use unobtanium_segmenter::chain::StartSegmentationChain;
//! use unobtanium_segmenter::normalization::NormalizationLowercase;
//! use unobtanium_segmenter::normalization::NormalizationRustStemmers;
//! use unobtanium_segmenter::segmentation::UnicodeSentenceSplitter;
//! use unobtanium_segmenter::segmentation::UnicodeWordSplitter;
//! use unobtanium_segmenter::segmentation::LinguaLanguageBlockSentenceSplitter;
//!
//! let sample_text = "The first digits of π are 3.141592. Dieser Satz ist in deutscher Sprache verfasst.";
//!
//! let output: Vec<String> = sample_text
//! 	.start_segmentation_chain() // Text to token iterator
//! 	.chain_segmenter(&LinguaLanguageBlockSentenceSplitter::new()) //Hybrid language detection and sentence splitter
//! 	.inspect(|t| println!("{t:?}")) // Debug helper
//! 	.chain_segmenter(&UnicodeWordSplitter::new())
//! 	.chain_augmenter(&AugmentationClassify::new()) // adds useful metadata and speeds up stemming
//! 	.chain_augmenter(&NormalizationRustStemmers::new())
//! 	.chain_augmenter(&NormalizationLowercase::new())
//! 	.map(|t| t.get_text_prefer_normalized_owned()) // token to text mapping
//! 	.collect();
//!
//! let expected_output: Vec<String> = vec![
//! 	"the", " ", "first", " ", "digit", " ", "of", " ", "π", " ", "are", " ", "3.141592", ".", " ", "" /* End of sentence marker */,
//! 	"dies", " ", "satz", " ", "ist", " ", "in", " ", "deutsch", " ", "sprach", " ", "verfasst", ".", "" /* End of sentence marker */,
//! ].iter().map(|s| s.to_string()).collect();
//!
//! assert_eq!(output, expected_output);
//! ```

mod initial_paragraph_splitter;
mod language_detection;
mod parser_iterator;
mod segmented_token;
mod sentence_grouped_iterator;
mod subdivision_map;

pub mod augmentation;
pub mod chain;
pub mod normalization;
pub mod segmentation;

pub use segmented_token::NormalizedText;
pub use segmented_token::SegmentedToken;
pub use segmented_token::SegmentedTokenKind;
pub use sentence_grouped_iterator::SentenceGroupedIterator;
pub use subdivision_map::SubdivisionMap;
pub use subdivision_map::UseOrSubdivide;

#[cfg(test)]
mod tests {
	use crate::augmentation::{AugmentationClassify, AugmentationDetectLanguage};
	use crate::chain::{ChainAugmenter, ChainSegmenter, StartSegmentationChain};
	use crate::normalization::{NormalizationLowercase, NormalizationRustStemmers};
	use crate::segmentation::{UnicodeSentenceSplitter, UnicodeWordSplitter};

	/// Runs a complete pipeline (sentence splitting, language detection,
	/// word splitting, classification, stemming and lowercasing) over a
	/// mixed English/German sample and compares the resulting token texts.
	///
	/// This used to be the main example doctest, but it has been replaced
	/// by an example showing what lingua can do instead.
	#[test]
	fn end_to_end_test_one() {
		let sample_text =
			"The first digits of π are 3.141592. Dieser Satz ist in deutscher Sprache verfasst.";

		// Pipeline stages, named up front so the chain below reads as a list of steps.
		let sentence_splitter = UnicodeSentenceSplitter::new();
		let language_detector = AugmentationDetectLanguage::new();
		let word_splitter = UnicodeWordSplitter::new();
		let classifier = AugmentationClassify::new(); // adds useful metadata and speeds up stemming
		let stemmer = NormalizationRustStemmers::new();
		let lowercaser = NormalizationLowercase::new();

		let actual: Vec<String> = sample_text
			.start_segmentation_chain() // Text to token iterator
			.chain_segmenter(&sentence_splitter)
			.chain_augmenter(&language_detector)
			.inspect(|t| println!("{t:?}")) // Debug helper
			.chain_segmenter(&word_splitter)
			.chain_augmenter(&classifier)
			.chain_augmenter(&stemmer)
			.chain_augmenter(&lowercaser)
			.map(|t| t.get_text_prefer_normalized_owned()) // token to text mapping
			.collect();

		// The empty strings are the tokens emitted at the end of each sentence.
		let expected: Vec<String> = [
			// English sentence
			"the", " ", "first", " ", "digit", " ", "of", " ", "π", " ", "are", " ", "3.141592",
			".", " ", "",
			// German sentence
			"dies", " ", "satz", " ", "ist", " ", "in", " ", "deutsch", " ", "sprach", " ",
			"verfasst", ".", "",
		]
		.iter()
		.map(|&token| String::from(token))
		.collect();

		assert_eq!(actual, expected);
	}
}