unobtanium_segmenter/
lib.rs

1// SPDX-FileCopyrightText: 2026 Slatian
2//
3// SPDX-License-Identifier: LGPL-3.0-only
4
5#![allow(clippy::needless_return)] // Explicit returning can improve readability
6#![allow(clippy::result_unit_err)] // Better than using an Option …
7#![allow(clippy::to_string_trait_impl)] // Don't want to deal with reimplementing as `Display` right now.
8#![allow(clippy::redundant_field_names)] // Those are okay
9#![allow(clippy::tabs_in_doc_comments)] // Tabs are for indentation, period.
10#![warn(missing_docs)]
11
12//! Crate that gives you building blocks for putting together text segmentation pipelines.
13//!
14//! The base unit of data it works with is a [SegmentedToken]. It is based on splitting these into increasingly smaller tokens using [segmenters][segmentation], in between adding metadata using [augmenters][augmentation] and finally applying [normalization].
15//!
16//! All this is based on Rust's built-in Iterator framework, so whenever a "magic" method from this crate operates on multiple tokens, that could be any iterator that happens to contain [SegmentedToken] items; and whenever something is returned, it can be treated like any other iterator.
17//!
18//! ```
19//! use unobtanium_segmenter::augmentation::AugmentationClassify;
20//! use unobtanium_segmenter::augmentation::AugmentationDetectLanguage;
21//! use unobtanium_segmenter::chain::ChainAugmenter;
22//! use unobtanium_segmenter::chain::ChainSegmenter;
23//! use unobtanium_segmenter::chain::StartSegmentationChain;
24//! use unobtanium_segmenter::normalization::NormalizationLowercase;
25//! use unobtanium_segmenter::normalization::NormalizationRustStemmers;
26//! use unobtanium_segmenter::segmentation::UnicodeSentenceSplitter;
27//! use unobtanium_segmenter::segmentation::UnicodeWordSplitter;
28//!
29//! let sample_text = "The first digits of π are 3.141592. Dieser Satz ist in deutscher Sprache verfasst.";
30//!
31//! let output: Vec<String> = sample_text
32//! 	.start_segmentation_chain() // Text to token iterator
33//! 	.chain_segmenter(&UnicodeSentenceSplitter::new())
34//! 	.chain_augmenter(&AugmentationDetectLanguage::new())
35//! 	.inspect(|t| println!("{t:?}")) // Debug helper
36//! 	.chain_segmenter(&UnicodeWordSplitter::new())
37//! 	.chain_augmenter(&AugmentationClassify::new()) // adds useful metadata and speeds up stemming
38//! 	.chain_augmenter(&NormalizationRustStemmers::new())
39//! 	.chain_augmenter(&NormalizationLowercase::new())
40//! 	.map(|t| t.get_text_prefer_normalized_owned()) // token to text mapping
41//! 	.collect();
42//!
43//! let expected_output: Vec<String> = vec![
44//! 	"the", " ", "first", " ", "digit", " ", "of", " ", "π", " ", "are", " ", "3.141592", ".", " ", "",
45//! 	"dies", " ", "satz", " ", "ist", " ", "in", " ", "deutsch", " ", "sprach", " ", "verfasst", ".", ""
46//! ].iter().map(|s| s.to_string()).collect();
47//!
48//! assert_eq!(output, expected_output);
49//! ```
50
51mod initial_paragraph_splitter;
52mod parser_iterator;
53mod segmented_token;
54mod sentence_grouped_iterator;
55mod subdivision_map;
56
57pub mod augmentation;
58pub mod chain;
59pub mod normalization;
60pub mod segmentation;
61
62pub use segmented_token::NormalizedText;
63pub use segmented_token::SegmentedToken;
64pub use segmented_token::SegmentedTokenKind;
65pub use sentence_grouped_iterator::SentenceGroupedIterator;
66pub use subdivision_map::SubdivisionMap;
67pub use subdivision_map::UseOrSubdivide;