//! Building blocks for text segmentation pipelines.
//!
//! The public API is split across four modules:
//!
//! - `segmentation`: splitters such as `UnicodeSentenceSplitter` and
//!   `UnicodeWordSplitter`.
//! - `augmentation`: augmenters such as `AugmentationDetectLanguage` and
//!   `AugmentationClassify`.
//! - `normalization`: normalizers such as `NormalizationRustStemmers` and
//!   `NormalizationLowercase`.
//! - `chain`: `StartSegmentationChain`, `ChainSegmenter` and `ChainAugmenter`,
//!   which are used to combine the pieces above into a single pipeline.
//!
//! A handful of core types are re-exported at the crate root.

// Crate-wide lint configuration: a few stylistic Clippy lints are silenced,
// and rustc is asked to warn about undocumented public items.
#![allow(clippy::needless_return)] #![allow(clippy::result_unit_err)] #![allow(clippy::to_string_trait_impl)] #![allow(clippy::redundant_field_names)] #![allow(clippy::tabs_in_doc_comments)] #![warn(missing_docs)]
// Private implementation modules; selected items are re-exported below.
mod initial_paragraph_splitter;
mod language_detection;
mod parser_iterator;
mod segmented_token;
mod sentence_grouped_iterator;
mod subdivision_map;
// Public modules that make up the crate's API surface.
pub mod augmentation;
pub mod chain;
pub mod normalization;
pub mod segmentation;
// Re-exports of types defined in the private modules above.
pub use segmented_token::NormalizedText;
pub use segmented_token::SegmentedToken;
pub use segmented_token::SegmentedTokenKind;
pub use sentence_grouped_iterator::SentenceGroupedIterator;
pub use subdivision_map::SubdivisionMap;
pub use subdivision_map::UseOrSubdivide;
#[cfg(test)]
mod tests {
    use crate::augmentation::{AugmentationClassify, AugmentationDetectLanguage};
    use crate::chain::{ChainAugmenter, ChainSegmenter, StartSegmentationChain};
    use crate::normalization::{NormalizationLowercase, NormalizationRustStemmers};
    use crate::segmentation::{UnicodeSentenceSplitter, UnicodeWordSplitter};

    /// Mixed-language input: one English sentence followed by one German sentence.
    const SAMPLE_TEXT: &str =
        "The first digits of π are 3.141592. Dieser Satz ist in deutscher Sprache verfasst.";

    /// Token texts the full pipeline is expected to yield for [`SAMPLE_TEXT`].
    ///
    /// Words appear lowercased and stemmed (e.g. "digits" -> "digit",
    /// "deutscher" -> "deutsch"). The empty strings are part of the expected
    /// stream; presumably they are the sentence-level tokens left over after
    /// word splitting (TODO confirm against the chain implementation).
    const EXPECTED_TOKENS: &[&str] = &[
        "the", " ", "first", " ", "digit", " ", "of", " ", "π", " ", "are", " ", "3.141592",
        ".", " ", "", "dies", " ", "satz", " ", "ist", " ", "in", " ", "deutsch", " ",
        "sprach", " ", "verfasst", ".", "",
    ];

    /// Runs the whole chain (sentence split, language detection, word split,
    /// classification, stemming, lowercasing) and compares the resulting token
    /// texts against [`EXPECTED_TOKENS`].
    #[test]
    fn end_to_end_test_one() {
        let produced = SAMPLE_TEXT
            .start_segmentation_chain()
            .chain_segmenter(&UnicodeSentenceSplitter::new())
            .chain_augmenter(&AugmentationDetectLanguage::new())
            // Dump the sentence-level tokens; visible when the test fails or
            // when run with `--nocapture`.
            .inspect(|t| println!("{t:?}"))
            .chain_segmenter(&UnicodeWordSplitter::new())
            .chain_augmenter(&AugmentationClassify::new())
            .chain_augmenter(&NormalizationRustStemmers::new())
            .chain_augmenter(&NormalizationLowercase::new())
            .map(|token| token.get_text_prefer_normalized_owned())
            .collect::<Vec<String>>();

        let wanted: Vec<String> = EXPECTED_TOKENS
            .iter()
            .map(|text| text.to_string())
            .collect();

        assert_eq!(produced, wanted);
    }
}