unobtanium_segmenter/segmentation/
unicode_sentence.rs1use unicode_segmentation::UnicodeSegmentation;
2
3use std::vec::IntoIter;
4
5use crate::SegmentedToken;
6use crate::UseOrSubdivide;
7
8use crate::segmentation::Segmenter;
9
/// Segmenter that splits text at Unicode sentence boundaries.
///
/// The type has no fields, so it carries no configuration or state.
#[derive(Clone, Debug, Default)]
pub struct UnicodeSentenceSplitter {}
15
16impl UnicodeSentenceSplitter {
17 pub fn new() -> Self {
19 Default::default()
20 }
21}
22
23impl Segmenter for UnicodeSentenceSplitter {
24 type SubdivisionIter<'a> = IntoIter<SegmentedToken<'a>>;
25
26 fn subdivide<'a>(
27 &self,
28 token: SegmentedToken<'a>,
29 ) -> UseOrSubdivide<SegmentedToken<'a>, IntoIter<SegmentedToken<'a>>> {
30 let sentence_iterator = token.text.split_sentence_bounds();
31 let mut collection = Vec::new();
32 for sentence in sentence_iterator {
33 collection.push(SegmentedToken::new_derived_from(sentence, &token));
34 }
35 if collection.len() > 1 {
36 return UseOrSubdivide::Subdivide(collection.into_iter());
37 } else {
38 return UseOrSubdivide::Use(token);
39 }
40 }
41}
42
#[cfg(test)]
mod test {
    use super::*;

    use crate::chain::{ChainSegmenter, StartSegmentationChain};

    /// Feeds a multi-sentence text through the splitter and checks that each
    /// sentence comes out as its own token, with trailing whitespace kept on
    /// the sentence it follows.
    #[test]
    fn test_unicode_sentence_split() {
        let input = "The quick (\"brown\") fox can't jump 32.3 feet, right?\nThe quick (\"brown\") fox. The value of π in german is '3,141592…'.";
        let splitter = UnicodeSentenceSplitter::new();

        let sentences: Vec<&str> = input
            .start_segmentation_chain()
            .chain_segmenter(&splitter)
            .map(|token| token.text)
            .collect();

        assert_eq!(
            sentences,
            [
                "The quick (\"brown\") fox can't jump 32.3 feet, right?\n",
                "The quick (\"brown\") fox. ",
                "The value of π in german is '3,141592…'.",
            ],
        );
    }
}