unobtanium-segmenter 0.5.2

A text segmentation toolbox for search applications inspired by charabia and tantivy.
Documentation
// SPDX-FileCopyrightText: 2026 Slatian
//
// SPDX-License-Identifier: LGPL-3.0-only

use crate::SegmentedToken;

/// Iterator adapter that wraps another iterator of [`SegmentedToken`s][SegmentedToken]
/// and groups the tokens into sentences.
///
/// Right after every token that has its
/// [`is_end_of_sentence`][SegmentedToken::is_end_of_sentence] flag set, this iterator
/// yields a single `None`. That `None` ends a `for`-loop over the current sentence,
/// but the iterator is not exhausted: calling `next()` again continues with the
/// tokens that follow. Iterate by mutable reference (for example via `by_ref()`)
/// to keep using the iterator after such a break.
///
/// End of sentence markers that don't carry any text are discarded; they only
/// cause the break.
///
/// Use [`is_at_end`][SentenceGroupedIterator::is_at_end] to tell a sentence break
/// apart from the end of the wrapped iterator.
pub struct SentenceGroupedIterator<'a, I>
where
	I: Iterator<Item = SegmentedToken<'a>>,
{
	/// The wrapped token iterator.
	inner: I,
	/// Set after a non-empty end of sentence token was yielded, so that the
	/// following call to `next()` returns the break.
	next_is_break: bool,
	/// Set when `inner` returned `None`, cleared whenever `inner` yields a token.
	is_at_end: bool,
}

impl<'a, I> SentenceGroupedIterator<'a, I>
where
	I: Iterator<Item = SegmentedToken<'a>>,
{
	/// Wrap `inner` in a new sentence grouping iterator.
	///
	/// The returned iterator has no break pending and is not marked as being
	/// at the end.
	pub fn new(inner: I) -> Self {
		Self {
			inner,
			is_at_end: false,
			next_is_break: false,
		}
	}

	/// Returns whether the most recent `None` was produced because the inner
	/// iterator hit its end, as opposed to a sentence break.
	///
	/// This is `false` until the inner iterator has returned `None`, and it is
	/// reset to `false` as soon as the inner iterator yields another token.
	pub fn is_at_end(&self) -> bool {
		let at_end: bool = self.is_at_end;
		at_end
	}
}

impl<'a, I> Iterator for SentenceGroupedIterator<'a, I>
where
	I: Iterator<Item = SegmentedToken<'a>>,
{
	type Item = SegmentedToken<'a>;

	/// Yields the next token of the current sentence.
	///
	/// Returns `None` either as a sentence break or because the wrapped
	/// iterator is exhausted; only the latter sets the `is_at_end` flag.
	fn next(&mut self) -> Option<Self::Item> {
		// A break was scheduled by the previous call: consume it and end
		// the current sentence without touching the inner iterator.
		if self.next_is_break {
			self.next_is_break = false;
			return None;
		}

		let token = match self.inner.next() {
			Some(token) => token,
			None => {
				// Inner iterator is exhausted, remember that this `None`
				// is not just a sentence break.
				self.is_at_end = true;
				return None;
			}
		};
		self.is_at_end = false;

		// Ordinary token in the middle of a sentence: pass it through.
		if !token.is_end_of_sentence {
			return Some(token);
		}

		// Pure marker without text: drop it and break immediately.
		if token.text.is_empty() {
			return None;
		}

		// Token with text that also ends the sentence: hand it out now and
		// deliver the break on the following call.
		self.next_is_break = true;
		Some(token)
	}
}