unobtanium_segmenter/segmentation/
unicode_sentence.rs1use unicode_segmentation::UnicodeSegmentation;
6
7use std::vec::IntoIter;
8
9use crate::SegmentedToken;
10use crate::UseOrSubdivide;
11
12use crate::segmentation::Segmenter;
13
/// Segmenter that cuts a token's text along Unicode sentence boundaries.
///
/// The type carries no configuration or state; all of its behavior lives in
/// its [`Segmenter`] implementation.
#[derive(Debug, Clone, Default)]
pub struct UnicodeSentenceSplitter {}

impl UnicodeSentenceSplitter {
    /// Creates a new splitter.
    ///
    /// The struct has no fields, so this yields the same value as
    /// `UnicodeSentenceSplitter::default()`.
    pub fn new() -> Self {
        Self {}
    }
}
26
27impl Segmenter for UnicodeSentenceSplitter {
28 type SubdivisionIter<'a> = IntoIter<SegmentedToken<'a>>;
29
30 fn subdivide<'a>(
31 &self,
32 token: SegmentedToken<'a>,
33 ) -> UseOrSubdivide<SegmentedToken<'a>, IntoIter<SegmentedToken<'a>>> {
34 let sentence_iterator = token.text.split_sentence_bounds();
35 let mut collection = Vec::new();
36 for sentence in sentence_iterator {
37 let (main, tail) = sentence.split_at(sentence.len());
38 collection.push(SegmentedToken::new_derived_from(main, &token));
39 collection.push(SegmentedToken::new_end_of_sentence(tail));
40 }
41 if collection.len() > 1 {
42 return UseOrSubdivide::Subdivide(collection.into_iter());
43 } else {
44 return UseOrSubdivide::Use(token);
45 }
46 }
47}
48
#[cfg(test)]
mod test {

    use super::*;

    use crate::chain::{ChainSegmenter, StartSegmentationChain};

    /// Runs the splitter over a multi-sentence text and checks that every
    /// sentence comes back verbatim (trailing whitespace included), each one
    /// followed by an empty end-of-sentence marker token.
    #[test]
    fn test_unicode_sentence_split() {
        let input = "The quick (\"brown\") fox can't jump 32.3 feet, right?\nThe quick (\"brown\") fox. The value of π in german is '3,141592…'.";

        let splitter = UnicodeSentenceSplitter::new();
        let actual: Vec<&str> = input
            .start_segmentation_chain()
            .chain_segmenter(&splitter)
            .map(|segment| segment.text)
            .collect();

        // Each sentence is immediately followed by the empty marker text.
        let expected: Vec<&str> = vec![
            "The quick (\"brown\") fox can't jump 32.3 feet, right?\n",
            "",
            "The quick (\"brown\") fox. ",
            "",
            "The value of π in german is '3,141592…'.",
            "",
        ];

        assert_eq!(actual, expected);
    }
}