Skip to main content

unobtanium_segmenter/segmentation/
unicode_sentence.rs

1// SPDX-FileCopyrightText: 2026 Slatian
2//
3// SPDX-License-Identifier: LGPL-3.0-only
4
5use unicode_segmentation::UnicodeSegmentation;
6
7use std::vec::IntoIter;
8
9use crate::SegmentedToken;
10use crate::UseOrSubdivide;
11
12use crate::segmentation::Segmenter;
13
/// Splits large chunks of text into sentences according to the Unicode definition of a sentence.
///
/// Uses [`UnicodeSegmentation::split_sentence_bounds`] under the hood.
#[derive(Debug, Clone, Default)]
pub struct UnicodeSentenceSplitter {}

impl UnicodeSentenceSplitter {
	/// Creates a new `UnicodeSentenceSplitter`.
	///
	/// The splitter has no fields, so this yields the same value as
	/// `UnicodeSentenceSplitter::default()`.
	pub fn new() -> Self {
		Self {}
	}
}
26
27impl Segmenter for UnicodeSentenceSplitter {
28	type SubdivisionIter<'a> = IntoIter<SegmentedToken<'a>>;
29
30	fn subdivide<'a>(
31		&self,
32		token: SegmentedToken<'a>,
33	) -> UseOrSubdivide<SegmentedToken<'a>, IntoIter<SegmentedToken<'a>>> {
34		let sentence_iterator = token.text.split_sentence_bounds();
35		let mut collection = Vec::new();
36		for sentence in sentence_iterator {
37			let (main, tail) = sentence.split_at(sentence.len());
38			collection.push(SegmentedToken::new_derived_from(main, &token));
39			collection.push(SegmentedToken::new_end_of_sentence(tail));
40		}
41		if collection.len() > 1 {
42			return UseOrSubdivide::Subdivide(collection.into_iter());
43		} else {
44			return UseOrSubdivide::Use(token);
45		}
46	}
47}
48
#[cfg(test)]
mod test {

	use super::*;

	use crate::chain::ChainSegmenter;
	use crate::chain::StartSegmentationChain;

	/// Runs a three-sentence text through the splitter and checks that every
	/// sentence is followed by an empty end-of-sentence token.
	#[test]
	fn test_unicode_sentence_split() {
		let input = "The quick (\"brown\") fox can't jump 32.3 feet, right?\nThe quick (\"brown\")  fox. The value of π in german is '3,141592…'.";
		let splitter = UnicodeSentenceSplitter::new();

		let actual: Vec<&str> = input
			.start_segmentation_chain()
			.chain_segmenter(&splitter)
			.map(|segment| segment.text)
			.collect();

		// Sentence text alternates with the empty end-of-sentence marker text.
		let expected: Vec<&str> = vec![
			"The quick (\"brown\") fox can't jump 32.3 feet, right?\n",
			"",
			"The quick (\"brown\")  fox. ",
			"",
			"The value of π in german is '3,141592…'.",
			"",
		];

		assert_eq!(actual, expected);
	}
}