unobtanium-segmenter 0.5.2

A text segmentation toolbox for search applications inspired by charabia and tantivy.
Documentation
// SPDX-FileCopyrightText: 2026 Slatian
//
// SPDX-License-Identifier: LGPL-3.0-only

use crate::initial_paragraph_splitter::InitialParagraphSplitter;

use std::vec::IntoIter;

use crate::SegmentedToken;
use crate::UseOrSubdivide;
use crate::segmentation::Segmenter;

/// Naive word splitting that cuts tokens wherever the text changes between alphanumerics, whitespace and symbols.
///
/// **⚠️ This segmenter is unstable and may change significantly outside of semver guarantees**
///
/// As the name says, this is naive and currently not all that useful.
/// The actual splitting is delegated to the crate-internal `InitialParagraphSplitter`.
///
/// Please prefer the [UnicodeWordSplitter][crate::segmentation::UnicodeWordSplitter].
#[derive(Debug, Clone, Default)]
pub struct NaiveWordSplitter {}

impl NaiveWordSplitter {
	/// Create a new `NaiveWordSplitter` instance.
	///
	/// The splitter has no configuration, so this is the same as calling
	/// `NaiveWordSplitter::default()`.
	pub fn new() -> Self {
		Self::default()
	}
}

impl Segmenter for NaiveWordSplitter {
	type SubdivisionIter<'a> = IntoIter<SegmentedToken<'a>>;

	/// Split `token` into one token per fragment of its text.
	///
	/// The text of `token` is run through `InitialParagraphSplitter`. For every
	/// fragment it yields, a new token is created with
	/// `SegmentedToken::new_derived_from` (using the fragment's text and the
	/// original `token` as parent) and given the fragment's `kind`.
	///
	/// This always returns `UseOrSubdivide::Subdivide`, even when the splitter
	/// yields no fragments or only a single one.
	fn subdivide<'a>(
		&self,
		token: SegmentedToken<'a>,
	) -> UseOrSubdivide<SegmentedToken<'a>, IntoIter<SegmentedToken<'a>>> {
		let mut output: Vec<SegmentedToken<'a>> = Vec::new();
		for fragment in InitialParagraphSplitter::new(token.text) {
			// Derive from the incoming token rather than passing the raw fragment on.
			// NOTE(review): presumably `new_derived_from` carries over the parent's
			// metadata — confirm against `SegmentedToken`.
			let mut new_token = SegmentedToken::new_derived_from(fragment.text, &token);
			new_token.kind = fragment.kind;
			// Push the derived token; pushing `fragment` here would silently
			// discard the derivation made above.
			output.push(new_token);
		}
		UseOrSubdivide::Subdivide(output.into_iter())
	}
}