unobtanium_segmenter/lib.rs
1#![allow(clippy::needless_return)] // Explicit returning can improve readability
2#![allow(clippy::result_unit_err)] // Better than using an Option …
3#![allow(clippy::to_string_trait_impl)] // Don't want to deal with reimplementing as `Display` right now.
4#![allow(clippy::redundant_field_names)] // Those are okay
5#![allow(clippy::tabs_in_doc_comments)] // Tabs are for indentation, period.
6#![warn(missing_docs)]
7
8//! Crate that gives you building blocks for putting together text segmentation pipelines.
9//!
10//! The base unit of data it works with is a [SegmentedToken]. It is based on splitting these into increasingly smaller tokens using [segmenters][segmentation], in between adding metadata using [augmenters][augmentation] and finally applying [normalization].
11//!
12//! All this is based on Rust's built-in Iterator framework, so whenever a "magic" method from this crate operates on multiple tokens that could be any iterator that happens to contain [SegmentedToken] items and whenever something is returned it can be treated like any other iterator.
13//!
14//! ```
15//! use unobtanium_segmenter::augmentation::AugmentationClassify;
16//! use unobtanium_segmenter::augmentation::AugmentationDetectLanguage;
17//! use unobtanium_segmenter::chain::ChainAugmenter;
18//! use unobtanium_segmenter::chain::ChainSegmenter;
19//! use unobtanium_segmenter::chain::StartSegmentationChain;
20//! use unobtanium_segmenter::normalization::NormalizationLowercase;
21//! use unobtanium_segmenter::normalization::NormalizationRustStemmers;
22//! use unobtanium_segmenter::segmentation::UnicodeSentenceSplitter;
23//! use unobtanium_segmenter::segmentation::UnicodeWordSplitter;
24//!
25//! let sample_text = "The first digits of π are 3.141592. Dieser Satz ist in deutscher Sprache verfasst.";
26//!
27//! let output: Vec<String> = sample_text
28//! .start_segmentation_chain() // Text to token iterator
29//! .chain_segmenter(&UnicodeSentenceSplitter::new())
30//! .chain_augmenter(&AugmentationDetectLanguage::new())
31//! .inspect(|t| println!("{t:?}")) // Debug helper
32//! .chain_segmenter(&UnicodeWordSplitter::new())
33//! .chain_augmenter(&AugmentationClassify::new()) // adds useful metadata and speeds up stemming
34//! .chain_augmenter(&NormalizationRustStemmers::new())
35//! .chain_augmenter(&NormalizationLowercase::new())
36//! .map(|t| t.get_text_prefer_normalized_owned()) // token to text mapping
37//! .collect();
38//!
39//! let expected_output: Vec<String> = vec![
40//! "the", " ", "first", " ", "digit", " ", "of", " ", "π", " ", "are", " ", "3.141592", ".", " ", "",
41//! "dies", " ", "satz", " ", "ist", " ", "in", " ", "deutsch", " ", "sprach", " ", "verfasst", ".", ""
42//! ].iter().map(|s| s.to_string()).collect();
43//!
44//! assert_eq!(output, expected_output);
45//! ```
46
47mod initial_paragraph_splitter;
48mod parser_iterator;
49mod segmented_token;
50mod sentence_grouped_iterator;
51mod subdivision_map;
52
53pub mod augmentation;
54pub mod chain;
55pub mod normalization;
56pub mod segmentation;
57
58pub use segmented_token::SegmentedToken;
59pub use segmented_token::SegmentedTokenKind;
60pub use sentence_grouped_iterator::SentenceGroupedIterator;
61pub use subdivision_map::SubdivisionMap;
62pub use subdivision_map::UseOrSubdivide;