unobtanium_segmenter/
lib.rs

1#![allow(clippy::needless_return)] // Explicit returning can improve readability
2#![allow(clippy::result_unit_err)] // Better than using an Option …
3#![allow(clippy::to_string_trait_impl)] // Don't want to deal with reimplementing as `Display` right now.
4#![allow(clippy::redundant_field_names)] // Those are okay
5#![allow(clippy::tabs_in_doc_comments)] // Tabs are for indentation, period.
6#![warn(missing_docs)]
7
8//! Crate that gives you building blocks for putting together text segmentation pipelines.
9//!
10//! The base unit of data it works with is a [SegmentedToken]. Processing is based on splitting these into increasingly smaller tokens using [segmenters][segmentation], adding metadata in between using [augmenters][augmentation], and finally applying [normalization].
11//!
12//! All this is based on Rust's built-in Iterator framework, so whenever a "magic" method from this crate operates on multiple tokens, that could be any iterator that happens to contain [SegmentedToken] items, and whenever something is returned, it can be treated like any other iterator.
13//!
14//! ```
15//! use unobtanium_segmenter::augmentation::AugmentationDetectLanguage;
16//! use unobtanium_segmenter::chain::ChainAugmenter;
17//! use unobtanium_segmenter::chain::ChainSegmenter;
18//! use unobtanium_segmenter::chain::StartSegmentationChain;
19//! use unobtanium_segmenter::normalization::NormalizationLowercase;
20//! use unobtanium_segmenter::normalization::NormalizationRustStemmers;
21//! use unobtanium_segmenter::segmentation::UnicodeSentenceSplitter;
22//! use unobtanium_segmenter::segmentation::UnicodeWordSplitter;
23//!
24//! let sample_text = "The first digits of π are 3.141592. Dieser Satz ist in deutscher Sprache verfasst.";
25//!
26//! let output: Vec<String> = sample_text
27//! 	.start_segmentation_chain() // Text to token iterator
28//! 	.chain_segmenter(&UnicodeSentenceSplitter::new())
29//! 	.chain_augmenter(&AugmentationDetectLanguage::new())
30//! 	.inspect(|t| println!("{t:?}")) // Debug helper
31//! 	.chain_segmenter(&UnicodeWordSplitter::new())
32//! 	.chain_augmenter(&NormalizationRustStemmers::new())
33//! 	.chain_augmenter(&NormalizationLowercase::new())
34//! 	.map(|t| t.get_text_prefer_normalized_owned()) // token to text mapping
35//! 	.collect();
36//!
37//! let expected_output: Vec<String> = vec![
38//! 	"the", " ", "first", " ", "digit", " ", "of", " ", "π", " ", "are", " ", "3.141592", ".", " ",
39//! 	"dies", " ", "satz", " ", "ist", " ", "in", " ", "deutsch", " ", "sprach", " ", "verfasst", "."
40//! ].iter().map(|s| s.to_string()).collect();
41//!
42//! assert_eq!(output, expected_output);
43//! ```
44
45pub mod augmentation;
46pub mod chain;
47mod initial_paragraph_splitter;
48pub mod normalization;
49mod parser_iterator;
50pub mod segmentation;
51mod segmented_token;
52mod subdivision_map;
53
54pub use segmented_token::SegmentedToken;
55pub use segmented_token::SegmentedTokenKind;
56pub use subdivision_map::SubdivisionMap;
57pub use subdivision_map::UseOrSubdivide;