unobtanium_segmenter/segmentation/
naive_word.rs

1use crate::initial_paragraph_splitter::InitialParagraphSplitter;
2
3use std::vec::IntoIter;
4
5use crate::segmentation::Segmenter;
6use crate::SegmentedToken;
7use crate::UseOrSubdivide;
8
/// Naive word splitter that cuts tokens wherever the character class changes between alphanumerics, whitespace and symbols.
///
/// **⚠️ This segmenter is unstable and may change significantly outside of semver guarantees**
///
/// As the name says, this is a naive approach and currently not all that useful.
///
/// Please prefer the [UnicodeWordSplitter][crate::segmentation::UnicodeWordSplitter].
#[derive(Clone, Debug, Default)]
pub struct NaiveWordSplitter {}

impl NaiveWordSplitter {
	/// Create a new NaiveWordSplitter instance
	///
	/// The splitter has no configuration, so this is equivalent to `NaiveWordSplitter::default()`.
	pub fn new() -> Self {
		Self {}
	}
}
25
26impl Segmenter for NaiveWordSplitter {
27	type SubdivisionIter<'a> = IntoIter<SegmentedToken<'a>>;
28
29	fn subdivide<'a>(
30		&self,
31		token: SegmentedToken<'a>,
32	) -> UseOrSubdivide<SegmentedToken<'a>, IntoIter<SegmentedToken<'a>>> {
33		let mut output: Vec<SegmentedToken<'a>> = Vec::new();
34		for fragment in InitialParagraphSplitter::new(token.text) {
35			let mut new_token = SegmentedToken::new_derived_from(fragment.text, &token);
36			new_token.kind = fragment.kind;
37			output.push(fragment);
38		}
39		return UseOrSubdivide::Subdivide(output.into_iter());
40	}
41}