use unicode_segmentation::UnicodeSegmentation;
use std::vec::IntoIter;
use crate::SegmentedToken;
use crate::UseOrSubdivide;
use crate::segmentation::Segmenter;
/// Segmenter that breaks a token's text apart at Unicode word boundaries.
///
/// The type has no fields, so every instance is interchangeable; it exists to
/// carry the [`Segmenter`] implementation defined in this file.
#[derive(Clone, Debug, Default)]
pub struct UnicodeWordSplitter {}
impl UnicodeWordSplitter {
    /// Creates a new splitter.
    ///
    /// This is a convenience constructor that simply forwards to the type's
    /// `Default` implementation.
    pub fn new() -> Self {
        Self::default()
    }
}
impl Segmenter for UnicodeWordSplitter {
    type SubdivisionIter<'a> = IntoIter<SegmentedToken<'a>>;

    /// Splits `token` at the boundaries reported by
    /// `UnicodeSegmentation::split_word_bounds` on the token's text.
    ///
    /// Every piece yielded by the boundary iterator is turned into a new token
    /// via `SegmentedToken::new_derived_from`, with the incoming `token` passed
    /// as the token it is derived from.
    ///
    /// # Returns
    ///
    /// * `UseOrSubdivide::Subdivide` holding the derived tokens, in the order the
    ///   boundary iterator produced them, when the text yields more than one
    ///   piece.
    /// * `UseOrSubdivide::Use` holding the original `token` when the text yields
    ///   zero or one piece.
    fn subdivide<'a>(
        &self,
        token: SegmentedToken<'a>,
    ) -> UseOrSubdivide<SegmentedToken<'a>, IntoIter<SegmentedToken<'a>>> {
        let pieces: Vec<SegmentedToken<'a>> = token
            .text
            .split_word_bounds()
            .map(|word| SegmentedToken::new_derived_from(word, &token))
            .collect();

        // With at most one piece there is nothing to subdivide, so the
        // original token is handed back instead of a derived one.
        if pieces.len() > 1 {
            UseOrSubdivide::Subdivide(pieces.into_iter())
        } else {
            UseOrSubdivide::Use(token)
        }
    }
}
#[cfg(test)]
mod test {
    use super::*;
    use crate::chain::{ChainSegmenter, StartSegmentationChain};

    /// Feeds a two-line sample through a segmentation chain that contains only
    /// `UnicodeWordSplitter` and compares the resulting token texts with a
    /// hand-written list.
    ///
    /// The expected list pins down that whitespace and punctuation come out as
    /// their own tokens, while "can't", "32.3" and "3,141592" each stay in one
    /// piece.
    #[test]
    fn test_unicode_word_split() {
        let expected: Vec<&str> = vec![
            "The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can't", " ",
            "jump", " ", "32.3", " ", "feet", ",", " ", "right", "?", "\n", "The", " ", "quick",
            " ", "(", "\"", "brown", "\"", ")", " ", "fox", ".", " ", "The", " ", "value", " ",
            "of", " ", "π", " ", "in", " ", "german", " ", "is", " ", "'", "3,141592", "…", "'",
            ".",
        ];

        let input = "The quick (\"brown\") fox can't jump 32.3 feet, right?\nThe quick (\"brown\") fox. The value of π in german is '3,141592…'.";
        let splitter = UnicodeWordSplitter::new();

        // Build the chain first, then project every token down to its text.
        let chain = input.start_segmentation_chain().chain_segmenter(&splitter);
        let actual: Vec<&str> = chain.map(|token| token.text).collect();

        assert_eq!(actual, expected);
    }
}