use unicode_properties::{GeneralCategoryGroup, UnicodeGeneralCategory};
use crate::SegmentedToken;
use crate::SegmentedTokenKind;
use crate::parser_iterator::ParserIterator;
/// Splits a piece of text into coarse [`SegmentedToken`]s, grouping consecutive
/// characters by their Unicode general category group. Tokens are produced
/// through the type's [`Iterator`] implementation.
pub struct InitialParagraphSplitter<'a> {
// The full input; the text of every emitted token is sliced out of this string.
text: &'a str,
// Cursor over the same input, used to peek at and consume characters.
iterator: ParserIterator<'a>,
}
impl<'a> InitialParagraphSplitter<'a> {
pub fn new(text: &'a str) -> Self {
Self {
text,
iterator: ParserIterator::new(text),
}
}
}
impl<'a> Iterator for InitialParagraphSplitter<'a> {
    type Item = SegmentedToken<'a>;

    /// Produces the next token, or `None` once `peek` has no character left to offer.
    ///
    /// The general category group of the first pending character selects the
    /// token kind, and the token then extends over the following run of
    /// characters that belong to the same family:
    ///
    /// * Letter, Number or Mark -> [`SegmentedTokenKind::AlphaNumeric`]
    /// * Punctuation, Separator or Other -> [`SegmentedTokenKind::Separator`]
    /// * Symbol (plus U+200D inside the run) -> [`SegmentedTokenKind::Symbol`]
    fn next(&mut self) -> Option<Self::Item> {
        let first_group = self.iterator.peek()?.general_category_group();
        let start_index = self.iterator.index();

        // Each arm consumes its run and evaluates to the kind of the token.
        let token_kind = match first_group {
            GeneralCategoryGroup::Letter
            | GeneralCategoryGroup::Number
            | GeneralCategoryGroup::Mark => {
                self.iterator.consume_chars(|c| {
                    matches!(
                        c.general_category_group(),
                        GeneralCategoryGroup::Letter
                            | GeneralCategoryGroup::Number
                            | GeneralCategoryGroup::Mark
                    )
                });
                SegmentedTokenKind::AlphaNumeric
            }
            GeneralCategoryGroup::Punctuation
            | GeneralCategoryGroup::Separator
            | GeneralCategoryGroup::Other => {
                self.iterator.consume_chars(|c| {
                    matches!(
                        c.general_category_group(),
                        GeneralCategoryGroup::Punctuation
                            | GeneralCategoryGroup::Separator
                            | GeneralCategoryGroup::Other
                    )
                });
                SegmentedTokenKind::Separator
            }
            GeneralCategoryGroup::Symbol => {
                // U+200D (ZERO WIDTH JOINER) is not a Symbol itself, so it is
                // admitted explicitly; a run of symbols glued together with
                // joiners therefore stays inside one token.
                self.iterator.consume_chars(|c| {
                    matches!(c.general_category_group(), GeneralCategoryGroup::Symbol)
                        || c == '\u{200d}'
                });
                SegmentedTokenKind::Symbol
            }
        };

        let end_index = self.iterator.index();
        // Nothing was consumed: end the iteration instead of emitting an empty token.
        if start_index == end_index {
            return None;
        }

        // NOTE(review): assumes `index()` reports byte offsets into `text` - confirm.
        // `str::get` yields `None` for an out-of-range or non-char-boundary range,
        // which `?` turns into the end of iteration.
        Some(SegmentedToken::new(
            self.text.get(start_index..end_index)?,
            Some(token_kind),
        ))
    }
}
#[cfg(test)]
mod test {
    use super::*;

    /// Runs the splitter over `input` and collects the text of every token.
    fn token_texts(input: &str) -> Vec<&str> {
        InitialParagraphSplitter::new(input)
            .map(|token| token.text)
            .collect()
    }

    #[test]
    fn test_initial_segmentation() {
        // Words, spaces, emoji and trailing punctuation each become separate tokens.
        let sentence = token_texts("The quick brown 🦊fox jumps over the lazy 🐶dog.");
        let expected_sentence = vec![
            "The", " ", "quick", " ", "brown", " ", "🦊", "fox", " ", "jumps", " ", "over",
            " ", "the", " ", "lazy", " ", "🐶", "dog", "."
        ];
        assert_eq!(sentence, expected_sentence);

        // A run of adjacent emoji is kept together as a single token.
        let families = token_texts("👪 👩👩👧");
        let expected_families = vec!["👪", " ", "👩👩👧"];
        assert_eq!(families, expected_families);
    }
}