unobtanium_segmenter/segmentation/
unicode_sentence.rs1use unicode_segmentation::UnicodeSegmentation;
2
3use std::vec::IntoIter;
4
5use crate::SegmentedToken;
6use crate::UseOrSubdivide;
7
8use crate::segmentation::Segmenter;
9
/// Segmenter that cuts text into sentences.
///
/// The type carries no state or configuration; every instance behaves the
/// same, so it can be created through `Default` and cloned freely.
#[derive(Clone, Debug, Default)]
pub struct UnicodeSentenceSplitter {}
15
16impl UnicodeSentenceSplitter {
17 pub fn new() -> Self {
19 Default::default()
20 }
21}
22
23impl Segmenter for UnicodeSentenceSplitter {
24 type SubdivisionIter<'a> = IntoIter<SegmentedToken<'a>>;
25
26 fn subdivide<'a>(
27 &self,
28 token: SegmentedToken<'a>,
29 ) -> UseOrSubdivide<SegmentedToken<'a>, IntoIter<SegmentedToken<'a>>> {
30 let sentence_iterator = token.text.split_sentence_bounds();
31 let mut collection = Vec::new();
32 for sentence in sentence_iterator {
33 let (main, tail) = sentence.split_at(sentence.len());
34 collection.push(SegmentedToken::new_derived_from(main, &token));
35 collection.push(SegmentedToken::new_end_of_sentence(tail));
36 }
37 if collection.len() > 1 {
38 return UseOrSubdivide::Subdivide(collection.into_iter());
39 } else {
40 return UseOrSubdivide::Use(token);
41 }
42 }
43}
44
#[cfg(test)]
mod test {

    use super::*;

    use crate::chain::ChainSegmenter;
    use crate::chain::StartSegmentationChain;

    /// Runs a three-sentence text through the splitter and checks that each
    /// sentence is followed by an empty end-of-sentence token.
    #[test]
    fn test_unicode_sentence_split() {
        let input = "The quick (\"brown\") fox can't jump 32.3 feet, right?\nThe quick (\"brown\") fox. The value of π in german is '3,141592…'.";

        let splitter = UnicodeSentenceSplitter::new();
        let produced: Vec<&str> = input
            .start_segmentation_chain()
            .chain_segmenter(&splitter)
            .map(|segment| segment.text)
            .collect();

        // Sentence texts alternate with the empty end-of-sentence markers.
        let expected = [
            "The quick (\"brown\") fox can't jump 32.3 feet, right?\n",
            "",
            "The quick (\"brown\") fox. ",
            "",
            "The value of π in german is '3,141592…'.",
            "",
        ];

        assert_eq!(produced, expected);
    }
}