use unicode_properties::GeneralCategoryGroup;
use unicode_properties::UnicodeGeneralCategory;
use crate::SegmentedToken;
use crate::SegmentedTokenKind;
use crate::augmentation::Augmenter;
/// Stateless augmenter that assigns a [`SegmentedTokenKind`] to a token based on
/// the Unicode general categories of the characters in its text.
///
/// The type carries no configuration; every instance behaves the same.
#[derive(Clone, Debug, Default)]
pub struct AugmentationClassify {}
impl AugmentationClassify {
    /// Creates a new classifier.
    ///
    /// Equivalent to [`Default::default`]; the type holds no state.
    pub fn new() -> Self {
        Self::default()
    }
}
impl Augmenter for AugmentationClassify {
    /// Classifies `token` by inspecting the characters of its (preferably
    /// normalized) text and stores the result in `token.kind`.
    ///
    /// Precedence, from strongest to weakest:
    ///
    /// 1. Any letter or number makes the token
    ///    [`SegmentedTokenKind::AlphaNumeric`]; scanning stops at the first one.
    /// 2. Otherwise, any symbol or non-whitespace "other" character (for example
    ///    an emoji) makes it [`SegmentedTokenKind::Symbol`].
    /// 3. Otherwise, any punctuation, separator, whitespace control character
    ///    (`'\n'`, `'\r'`, `'\t'`, ...) or NUL makes it
    ///    [`SegmentedTokenKind::Separator`].
    /// 4. Otherwise (empty text, or combining marks only) `kind` is set to `None`.
    ///
    /// The token is returned with only its `kind` field changed.
    fn augment<'a>(&self, mut token: SegmentedToken<'a>) -> SegmentedToken<'a> {
        let mut has_separators = false;
        let mut has_symbols = false;

        for c in token.get_text_prefer_normalized().chars() {
            match c.general_category_group() {
                GeneralCategoryGroup::Letter | GeneralCategoryGroup::Number => {
                    // A single alphanumeric character decides the whole token.
                    token.kind = Some(SegmentedTokenKind::AlphaNumeric);
                    return token;
                }
                GeneralCategoryGroup::Punctuation | GeneralCategoryGroup::Separator => {
                    has_separators = true;
                }
                GeneralCategoryGroup::Symbol | GeneralCategoryGroup::Other => {
                    // Whitespace control characters (line feed, carriage return,
                    // tab, ...) fall into the `Other` group but act as separators,
                    // as does NUL. Treating all of them alike keeps "\r\n" and
                    // "\t" consistent with "\n" instead of reporting them as
                    // symbols.
                    if c == '\0' || c.is_whitespace() {
                        has_separators = true;
                    } else {
                        has_symbols = true;
                    }
                }
                // Combining marks alone do not determine a kind.
                GeneralCategoryGroup::Mark => {}
            }
        }

        // Symbols outrank separators when a token contains both.
        token.kind = if has_symbols {
            Some(SegmentedTokenKind::Symbol)
        } else if has_separators {
            Some(SegmentedTokenKind::Separator)
        } else {
            None
        };
        token
    }
}
#[cfg(test)]
mod test {
    use super::*;
    use crate::chain::ChainAugmenter;
    use crate::chain::ChainSegmenter;
    use crate::chain::StartSegmentationChain;
    use crate::segmentation::UnicodeWordSplitter;

    /// Shorthand for the expected kind of an alphanumeric token.
    fn a() -> Option<SegmentedTokenKind> {
        Some(SegmentedTokenKind::AlphaNumeric)
    }

    /// Shorthand for the expected kind of a separator token.
    fn s() -> Option<SegmentedTokenKind> {
        Some(SegmentedTokenKind::Separator)
    }

    /// Shorthand for the expected kind of a symbol token.
    fn y() -> Option<SegmentedTokenKind> {
        Some(SegmentedTokenKind::Symbol)
    }

    /// Splits a mixed sentence into words and checks the kind assigned to
    /// every resulting token, including an emoji, numbers with inner
    /// punctuation, a newline and a non-ASCII letter.
    #[test]
    fn test_unicode_word_split() {
        let test_text = "The quick (\"brown\") fox🦊 can't jump 32.3 feet, right?\nThe quick (\"brown\") fox. The value of π in german is '3,141592…'.";
        let word_splitter = UnicodeWordSplitter::new();
        let classifier = AugmentationClassify::new();
        let result: Vec<(&str, Option<SegmentedTokenKind>)> = test_text
            .start_segmentation_chain()
            .chain_segmenter(&word_splitter)
            .chain_augmenter(&classifier)
            .map(|t| (t.text, t.kind))
            .collect();
        let expected_tokens = vec![
            ("The", a()),
            (" ", s()),
            ("quick", a()),
            (" ", s()),
            ("(", s()),
            ("\"", s()),
            ("brown", a()),
            ("\"", s()),
            (")", s()),
            (" ", s()),
            ("fox", a()),
            ("🦊", y()),
            (" ", s()),
            ("can't", a()),
            (" ", s()),
            ("jump", a()),
            (" ", s()),
            ("32.3", a()),
            (" ", s()),
            ("feet", a()),
            (",", s()),
            (" ", s()),
            ("right", a()),
            ("?", s()),
            ("\n", s()),
            ("The", a()),
            (" ", s()),
            ("quick", a()),
            (" ", s()),
            ("(", s()),
            ("\"", s()),
            ("brown", a()),
            ("\"", s()),
            (")", s()),
            (" ", s()),
            ("fox", a()),
            (".", s()),
            (" ", s()),
            ("The", a()),
            (" ", s()),
            ("value", a()),
            (" ", s()),
            ("of", a()),
            (" ", s()),
            // Must match the `π` in `test_text` exactly; a mis-decoded
            // spelling of the same bytes can never compare equal.
            ("π", a()),
            (" ", s()),
            ("in", a()),
            (" ", s()),
            ("german", a()),
            (" ", s()),
            ("is", a()),
            (" ", s()),
            ("'", s()),
            ("3,141592", a()),
            ("…", s()),
            ("'", s()),
            (".", s()),
        ];
        assert_eq!(result, expected_tokens);
    }
}