unobtanium_segmenter/segmentation/
unicode_sentence.rs

1use unicode_segmentation::UnicodeSegmentation;
2
3use std::vec::IntoIter;
4
5use crate::SegmentedToken;
6use crate::UseOrSubdivide;
7
8use crate::segmentation::Segmenter;
9
/// Splits large chunks of text into sentences according to the Unicode definition of a sentence.
///
/// Uses [`UnicodeSegmentation::split_sentence_bounds`] under the hood.
///
/// The splitter carries no configuration or state, so instances are interchangeable.
#[derive(Debug, Clone, Default)]
pub struct UnicodeSentenceSplitter {}
15
16impl UnicodeSentenceSplitter {
17	/// Create a new UnicodeSentenceSplitter instance
18	pub fn new() -> Self {
19		Default::default()
20	}
21}
22
23impl Segmenter for UnicodeSentenceSplitter {
24	type SubdivisionIter<'a> = IntoIter<SegmentedToken<'a>>;
25
26	fn subdivide<'a>(
27		&self,
28		token: SegmentedToken<'a>,
29	) -> UseOrSubdivide<SegmentedToken<'a>, IntoIter<SegmentedToken<'a>>> {
30		let sentence_iterator = token.text.split_sentence_bounds();
31		let mut collection = Vec::new();
32		for sentence in sentence_iterator {
33			let (main, tail) = sentence.split_at(sentence.len());
34			collection.push(SegmentedToken::new_derived_from(main, &token));
35			collection.push(SegmentedToken::new_end_of_sentence(tail));
36		}
37		if collection.len() > 1 {
38			return UseOrSubdivide::Subdivide(collection.into_iter());
39		} else {
40			return UseOrSubdivide::Use(token);
41		}
42	}
43}
44
#[cfg(test)]
mod test {

	use super::*;

	use crate::chain::ChainSegmenter;
	use crate::chain::StartSegmentationChain;

	/// Feeds a three-sentence sample through the splitter and checks that
	/// every sentence text is followed by an empty token.
	#[test]
	fn test_unicode_sentence_split() {
		let splitter = UnicodeSentenceSplitter::new();
		let input = "The quick (\"brown\") fox can't jump 32.3 feet, right?\nThe quick (\"brown\")  fox. The value of π in german is '3,141592…'.";

		// Each sentence keeps its trailing whitespace and is followed by an
		// empty token.
		let expected: Vec<&str> = vec![
			"The quick (\"brown\") fox can't jump 32.3 feet, right?\n",
			"",
			"The quick (\"brown\")  fox. ",
			"",
			"The value of π in german is '3,141592…'.",
			"",
		];

		let actual: Vec<&str> = input
			.start_segmentation_chain()
			.chain_segmenter(&splitter)
			.map(|token| token.text)
			.collect();

		assert_eq!(actual, expected);
	}
}