// SPDX-FileCopyrightText: 2026 Slatian
//
// SPDX-License-Identifier: LGPL-3.0-only

#![allow(clippy::needless_return)] // Explicit returning can improve readability
#![allow(clippy::result_unit_err)] // Better than using an Option …
#![allow(clippy::to_string_trait_impl)] // Don't want to deal with reimplementing as `Display` right now.
#![allow(clippy::redundant_field_names)] // Those are okay
#![allow(clippy::tabs_in_doc_comments)] // Tabs are for indentation, period.
#![warn(missing_docs)]

//! Crate that gives you building blocks for putting together text segmentation pipelines.
//!
//! The base unit of data it works with is the [SegmentedToken]. A pipeline splits these into increasingly smaller tokens using [segmenters][segmentation], adds metadata in between using [augmenters][augmentation] and finally applies [normalization].
//!
//! All of this is built on Rust's standard [Iterator] framework: whenever a "magic" method from this crate operates on multiple tokens, it accepts any iterator that happens to contain [SegmentedToken] items, and whatever it returns can be treated like any other iterator.
//!
//! ```
//! use unobtanium_segmenter::augmentation::AugmentationClassify;
//! use unobtanium_segmenter::chain::ChainAugmenter;
//! use unobtanium_segmenter::chain::ChainSegmenter;
//! use unobtanium_segmenter::chain::StartSegmentationChain;
//! use unobtanium_segmenter::normalization::NormalizationLowercase;
//! use unobtanium_segmenter::normalization::NormalizationRustStemmers;
//! use unobtanium_segmenter::segmentation::UnicodeWordSplitter;
//! use unobtanium_segmenter::segmentation::LinguaLanguageBlockSentenceSplitter;
//!
//! let sample_text = "The first digits of π are 3.141592. Dieser Satz ist in deutscher Sprache verfasst.";
//!
//! let output: Vec<String> = sample_text
//! 	.start_segmentation_chain() // Text to token iterator
//! 	.chain_segmenter(&LinguaLanguageBlockSentenceSplitter::new()) // Hybrid language detection and sentence splitting
//! 	.inspect(|t| println!("{t:?}")) // Debug helper
//! 	.chain_segmenter(&UnicodeWordSplitter::new())
//! 	.chain_augmenter(&AugmentationClassify::new()) // Adds useful metadata and speeds up stemming
//! 	.chain_augmenter(&NormalizationRustStemmers::new())
//! 	.chain_augmenter(&NormalizationLowercase::new())
//! 	.map(|t| t.get_text_prefer_normalized_owned()) // Token to text mapping
//! 	.collect();
//!
//! let expected_output: Vec<String> = vec![
//! 	"the", " ", "first", " ", "digit", " ", "of", " ", "π", " ", "are", " ", "3.141592", ".", " ", "" /* End of sentence marker */,
//! 	"dies", " ", "satz", " ", "ist", " ", "in", " ", "deutsch", " ", "sprach", " ", "verfasst", ".", "" /* End of sentence marker */,
//! ].iter().map(|s| s.to_string()).collect();
//!
//! assert_eq!(output, expected_output);
//! ```
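//!
//! Because the chain is built on ordinary iterators, the standard adapters can
//! be mixed in at any point. As a minimal sketch (assuming the token
//! boundaries shown above), this drops whitespace and the empty
//! end-of-sentence markers with a plain `filter`:
//!
//! ```
//! use unobtanium_segmenter::chain::ChainSegmenter;
//! use unobtanium_segmenter::chain::StartSegmentationChain;
//! use unobtanium_segmenter::segmentation::UnicodeSentenceSplitter;
//! use unobtanium_segmenter::segmentation::UnicodeWordSplitter;
//!
//! let words: Vec<String> = "Hello world."
//! 	.start_segmentation_chain()
//! 	.chain_segmenter(&UnicodeSentenceSplitter::new())
//! 	.chain_segmenter(&UnicodeWordSplitter::new())
//! 	.map(|t| t.get_text_prefer_normalized_owned())
//! 	.filter(|s| !s.trim().is_empty()) // Drop whitespace and marker tokens
//! 	.collect();
//!
//! assert_eq!(words, vec!["Hello", "world", "."]);
//! ```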

mod initial_paragraph_splitter;
mod language_detection;
mod parser_iterator;
mod segmented_token;
mod sentence_grouped_iterator;
mod subdivision_map;

pub mod augmentation;
pub mod chain;
pub mod normalization;
pub mod segmentation;

pub use segmented_token::NormalizedText;
pub use segmented_token::SegmentedToken;
pub use segmented_token::SegmentedTokenKind;
pub use sentence_grouped_iterator::SentenceGroupedIterator;
pub use subdivision_map::SubdivisionMap;
pub use subdivision_map::UseOrSubdivide;

#[cfg(test)]
mod tests {
	use crate::augmentation::AugmentationClassify;
	use crate::augmentation::AugmentationDetectLanguage;
	use crate::chain::ChainAugmenter;
	use crate::chain::ChainSegmenter;
	use crate::chain::StartSegmentationChain;
	use crate::normalization::NormalizationLowercase;
	use crate::normalization::NormalizationRustStemmers;
	use crate::segmentation::UnicodeSentenceSplitter;
	use crate::segmentation::UnicodeWordSplitter;

	/// This used to be the main doctest example; it was replaced in the crate docs by one showing what lingua can do instead.
	#[test]
	fn end_to_end_test_one() {
		let sample_text =
			"The first digits of π are 3.141592. Dieser Satz ist in deutscher Sprache verfasst.";

		let output: Vec<String> = sample_text
			.start_segmentation_chain() // Text to token iterator
			.chain_segmenter(&UnicodeSentenceSplitter::new())
			.chain_augmenter(&AugmentationDetectLanguage::new())
			.inspect(|t| println!("{t:?}")) // Debug helper
			.chain_segmenter(&UnicodeWordSplitter::new())
			.chain_augmenter(&AugmentationClassify::new()) // Adds useful metadata and speeds up stemming
			.chain_augmenter(&NormalizationRustStemmers::new())
			.chain_augmenter(&NormalizationLowercase::new())
			.map(|t| t.get_text_prefer_normalized_owned()) // Token to text mapping
			.collect();

		let expected_output: Vec<String> = vec![
			"the", " ", "first", " ", "digit", " ", "of", " ", "π", " ", "are", " ", "3.141592",
			".", " ", "", "dies", " ", "satz", " ", "ist", " ", "in", " ", "deutsch", " ",
			"sprach", " ", "verfasst", ".", "",
		]
		.iter()
		.map(|s| s.to_string())
		.collect();

		assert_eq!(output, expected_output);
	}
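
	/// Minimal smoke-test sketch, assuming a word splitter may follow
	/// `start_segmentation_chain()` directly: it only checks the one property
	/// lowercasing must guarantee, that no uppercase characters survive.
	#[test]
	fn lowercase_only_smoke_test() {
		let output: Vec<String> = "Hello World"
			.start_segmentation_chain()
			.chain_segmenter(&UnicodeWordSplitter::new())
			.chain_augmenter(&NormalizationLowercase::new())
			.map(|t| t.get_text_prefer_normalized_owned())
			.collect();

		// Every emitted token should come out lowercase.
		assert!(output.iter().all(|s| !s.chars().any(|c| c.is_uppercase())));
	}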
}