unobtanium_segmenter/segmentation/
unicode_word.rs1use unicode_segmentation::UnicodeSegmentation;
6
7use std::vec::IntoIter;
8
9use crate::SegmentedToken;
10use crate::UseOrSubdivide;
11use crate::segmentation::Segmenter;
12
/// Segmenter that cuts a token's text at Unicode word boundaries.
///
/// The splitter holds no configuration or state. Boundary detection is
/// delegated to [`UnicodeSegmentation::split_word_bounds`], which yields every
/// segment between two boundaries, so whitespace and punctuation come out as
/// tokens of their own instead of being dropped.
#[derive(Debug, Clone, Default)]
pub struct UnicodeWordSplitter {}

impl UnicodeWordSplitter {
    /// Creates a new splitter.
    ///
    /// Equivalent to [`UnicodeWordSplitter::default`]; provided for call sites
    /// that prefer an explicit constructor.
    pub fn new() -> Self {
        Self::default()
    }
}
25
26impl Segmenter for UnicodeWordSplitter {
27 type SubdivisionIter<'a> = IntoIter<SegmentedToken<'a>>;
28
29 fn subdivide<'a>(
30 &self,
31 token: SegmentedToken<'a>,
32 ) -> UseOrSubdivide<SegmentedToken<'a>, IntoIter<SegmentedToken<'a>>> {
33 let word_iterator = token.text.split_word_bounds();
34 let mut collection = Vec::new();
35 for word in word_iterator {
36 collection.push(SegmentedToken::new_derived_from(word, &token));
37 }
38 if collection.len() > 1 {
39 return UseOrSubdivide::Subdivide(collection.into_iter());
40 } else {
41 return UseOrSubdivide::Use(token);
42 }
43 }
44}
45
#[cfg(test)]
mod test {

    use super::*;

    use crate::chain::ChainSegmenter;
    use crate::chain::StartSegmentationChain;

    /// Runs a mixed sample (quotes, apostrophes, decimal numbers, a newline and
    /// non-ASCII characters) through the splitter and checks the exact token
    /// sequence, including the whitespace and punctuation tokens.
    #[test]
    fn test_unicode_word_split() {
        let test_text = "The quick (\"brown\") fox can't jump 32.3 feet, right?\nThe quick (\"brown\") fox. The value of π in german is '3,141592…'.";

        let result: Vec<&str> = test_text
            .start_segmentation_chain()
            .chain_segmenter(&UnicodeWordSplitter::new())
            .map(|t| t.text)
            .collect();

        // "can't", "32.3" and "3,141592" are expected to stay in one piece.
        let expected_tokens = [
            "The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can't", " ",
            "jump", " ", "32.3", " ", "feet", ",", " ", "right", "?", "\n", "The", " ", "quick",
            " ", "(", "\"", "brown", "\"", ")", " ", "fox", ".", " ", "The", " ", "value", " ",
            "of", " ", "π", " ", "in", " ", "german", " ", "is", " ", "'", "3,141592", "…", "'",
            ".",
        ];

        assert_eq!(result, expected_tokens);
    }
}