unobtanium_segmenter/segmentation/unicode_word.rs

use unicode_segmentation::UnicodeSegmentation;

use std::vec::IntoIter;

use crate::segmentation::Segmenter;
use crate::SegmentedToken;
use crate::UseOrSubdivide;

/// Split text into words according to the Unicode word boundary rules (UAX #29).
///
/// While not perfect, this should work well enough as an easy starting point.
///
/// Uses [UnicodeSegmentation::split_word_bounds] under the hood.
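///
/// # Example
///
/// A minimal usage sketch mirroring the unit test below. It is marked
/// `ignore` because the `unobtanium_segmenter::…` import paths assume the
/// crate's public re-exports; adjust them to the actual module layout.
///
/// ```ignore
/// use unobtanium_segmenter::chain::{ChainSegmenter, StartSegmentationChain};
/// use unobtanium_segmenter::segmentation::UnicodeWordSplitter;
///
/// let words: Vec<&str> = "The quick fox"
/// 	.start_segmentation_chain()
/// 	.chain_segmenter(&UnicodeWordSplitter::new())
/// 	.map(|t| t.text)
/// 	.collect();
///
/// // Whitespace survives as its own tokens, since split_word_bounds keeps it.
/// assert_eq!(words, vec!["The", " ", "quick", " ", "fox"]);
/// ```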
#[derive(Debug, Clone, Default)]
pub struct UnicodeWordSplitter {}

impl UnicodeWordSplitter {
	/// Create a new [UnicodeWordSplitter] instance.
	pub fn new() -> Self {
		Default::default()
	}
}

impl Segmenter for UnicodeWordSplitter {
	type SubdivisionIter<'a> = IntoIter<SegmentedToken<'a>>;

	fn subdivide<'a>(
		&self,
		token: SegmentedToken<'a>,
	) -> UseOrSubdivide<SegmentedToken<'a>, IntoIter<SegmentedToken<'a>>> {
		// Split on Unicode word boundaries; whitespace and punctuation come
		// back as their own segments.
		let collection: Vec<_> = token
			.text
			.split_word_bounds()
			.map(|word| SegmentedToken::new_derived_from(word, &token))
			.collect();

		// Only subdivide if splitting actually produced more than one
		// segment; otherwise hand the original token back unchanged.
		if collection.len() > 1 {
			UseOrSubdivide::Subdivide(collection.into_iter())
		} else {
			UseOrSubdivide::Use(token)
		}
	}
}

#[cfg(test)]
mod test {

	use super::*;

	use crate::chain::ChainSegmenter;
	use crate::chain::StartSegmentationChain;

	#[test]
	fn test_unicode_word_split() {
		let test_text = "The quick (\"brown\") fox can't jump 32.3 feet, right?\nThe quick (\"brown\")  fox. The value of π in german is '3,141592…'.";

		let result: Vec<&str> = test_text
			.start_segmentation_chain()
			.chain_segmenter(&UnicodeWordSplitter::new())
			.map(|t| t.text)
			.collect();

		let expected_tokens = vec![
			"The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox", " ", "can't", " ",
			"jump", " ", "32.3", " ", "feet", ",", " ", "right", "?", "\n", "The", " ", "quick",
			" ", "(", "\"", "brown", "\"", ")", "  ", "fox", ".", " ", "The", " ", "value", " ",
			"of", " ", "π", " ", "in", " ", "german", " ", "is", " ", "'", "3,141592", "…", "'",
			".",
		];

		assert_eq!(result, expected_tokens);
	}
}