use unobtanium_segmenter::augmentation::AugmentationClassify;
use unobtanium_segmenter::augmentation::AugmentationDetectLanguage;
use unobtanium_segmenter::chain::ChainAugmenter;
use unobtanium_segmenter::chain::ChainSegmenter;
use unobtanium_segmenter::chain::StartSegmentationChain;
use unobtanium_segmenter::normalization::NormalizationLowercase;
use unobtanium_segmenter::normalization::NormalizationRustStemmers;
use unobtanium_segmenter::segmentation::UnicodeSentenceSplitter;
use unobtanium_segmenter::segmentation::UnicodeWordSplitter;
use unobtanium_segmenter::SegmentedToken;
use unobtanium_segmenter::SegmentedTokenKind;
/// Runs `text` through the full segmentation chain and yields every
/// resulting token, without dropping any of them.
///
/// The stages are applied in this order:
///
/// 1. [`AugmentationDetectLanguage`] on the input as a whole
/// 2. [`UnicodeSentenceSplitter`]
/// 3. [`AugmentationDetectLanguage`] again, now on the output of the
///    sentence splitter
/// 4. [`UnicodeWordSplitter`]
/// 5. [`AugmentationClassify`]
/// 6. [`NormalizationRustStemmers`]
/// 7. [`NormalizationLowercase`]
///
/// Use [`segment`] instead when only the tokens accepted by
/// [`segment_filter`] are wanted.
pub fn segment_unfiltered(text: &str) -> impl Iterator<Item=SegmentedToken> {
    // The language detector is the only stage used twice, so it is the only
    // one that needs a binding; the first use takes a clone, the second
    // consumes the original.
    let language_detector = AugmentationDetectLanguage::new();

    text.start_segmentation_chain()
        .chain_owned_augmenter(language_detector.clone())
        .chain_owned_segmenter(UnicodeSentenceSplitter::new())
        .chain_owned_augmenter(language_detector)
        .chain_owned_segmenter(UnicodeWordSplitter::new())
        .chain_owned_augmenter(AugmentationClassify::new())
        // NOTE(review): stemming runs before lowercasing here; this order is
        // kept exactly as it was — confirm it is the intended one.
        .chain_owned_augmenter(NormalizationRustStemmers::new())
        .chain_owned_augmenter(NormalizationLowercase::new())
}
/// Segments `text` and yields only the tokens that pass [`segment_filter`].
///
/// This is [`segment_unfiltered`] with [`segment_filter`] applied as an
/// iterator filter on top; the tokens themselves are not modified.
pub fn segment(text: &str) -> impl Iterator<Item=SegmentedToken> {
    let all_tokens = segment_unfiltered(text);
    all_tokens.filter(segment_filter)
}
/// Decides whether `token` should be kept by [`segment`].
///
/// Returns `true` only when both conditions hold:
///
/// * the token's `kind` is `Some(SegmentedTokenKind::AlphaNumeric)`
///   (a token with no kind at all is rejected), and
/// * the `len()` of its preferred text (as returned by
///   `get_text_prefer_normalized`) is strictly below 512.
pub fn segment_filter(token: &SegmentedToken) -> bool {
    // Exclusive upper bound on the token text's `len()`.
    // NOTE(review): for a `str` this counts bytes, not characters — confirm
    // that is the intended unit.
    const TEXT_LEN_LIMIT: usize = 512;

    if token.kind != Some(SegmentedTokenKind::AlphaNumeric) {
        return false;
    }
    token.get_text_prefer_normalized().len() < TEXT_LEN_LIMIT
}