unobtanium_segmenter/segmentation/
naive_word.rs

1// SPDX-FileCopyrightText: 2026 Slatian
2//
3// SPDX-License-Identifier: LGPL-3.0-only
4
5use crate::initial_paragraph_splitter::InitialParagraphSplitter;
6
7use std::vec::IntoIter;
8
9use crate::SegmentedToken;
10use crate::UseOrSubdivide;
11use crate::segmentation::Segmenter;
12
/// A naive word splitter: tokens are cut wherever the text switches between alphanumerics, whitespace and symbols.
///
/// **⚠️ This segmenter is unstable and may change significantly outside of semver guarantees**
///
/// As the name says, this is naive and currently not all that useful.
///
/// Please prefer the [UnicodeWordSplitter][crate::segmentation::UnicodeWordSplitter].
#[derive(Clone, Debug, Default)]
pub struct NaiveWordSplitter {}

impl NaiveWordSplitter {
	/// Create a new NaiveWordSplitter instance
	pub fn new() -> Self {
		// The struct carries no state, so this is the same value `Default` produces.
		Self {}
	}
}
29
30impl Segmenter for NaiveWordSplitter {
31	type SubdivisionIter<'a> = IntoIter<SegmentedToken<'a>>;
32
33	fn subdivide<'a>(
34		&self,
35		token: SegmentedToken<'a>,
36	) -> UseOrSubdivide<SegmentedToken<'a>, IntoIter<SegmentedToken<'a>>> {
37		let mut output: Vec<SegmentedToken<'a>> = Vec::new();
38		for fragment in InitialParagraphSplitter::new(token.text) {
39			let mut new_token = SegmentedToken::new_derived_from(fragment.text, &token);
40			new_token.kind = fragment.kind;
41			output.push(fragment);
42		}
43		return UseOrSubdivide::Subdivide(output.into_iter());
44	}
45}