unobtanium_segmenter/
sentence_grouped_iterator.rs

1// SPDX-FileCopyrightText: 2026 Slatian
2//
3// SPDX-License-Identifier: LGPL-3.0-only
4
5use crate::SegmentedToken;
6
7/// Iterator that wraps another Iterator of [`SegmentedToken`s][SegmentedToken] and inserts a break after each end of sentence token, which interrupts a `for`-loop but can be used.
8///
9/// End of sentence markers that don't carry any text are automatically discarded.
10///
11/// It splits right after tokens tht have the at the [`is_end_of_sentence`][SegmentedToken::is_end_of_sentence] flag set.
12pub struct SentenceGroupedIterator<'a, I: Iterator<Item = SegmentedToken<'a>>> {
13	inner: I,
14	next_is_break: bool,
15	is_at_end: bool,
16}
17
18impl<'a, I: Iterator<Item = SegmentedToken<'a>>> SentenceGroupedIterator<'a, I> {
19	/// Create a new sentence collector that wraps the given iterator.
20	pub fn new(inner: I) -> Self {
21		Self {
22			inner,
23			next_is_break: false,
24			is_at_end: false,
25		}
26	}
27
28	/// Returns wheter the last iteration has stopped because the inner iterator
29	/// has hit its end.
30	pub fn is_at_end(&self) -> bool {
31		self.is_at_end
32	}
33}
34
35impl<'a, I: Iterator<Item = SegmentedToken<'a>>> Iterator for SentenceGroupedIterator<'a, I> {
36	type Item = SegmentedToken<'a>;
37
38	fn next(&mut self) -> Option<Self::Item> {
39		if self.next_is_break {
40			self.next_is_break = false;
41			None
42		} else if let Some(token) = self.inner.next() {
43			self.is_at_end = false;
44			if token.is_end_of_sentence {
45				if token.text.is_empty() {
46					// Empty marker token, return None immedeately
47					None
48				} else {
49					// Non-empty, but marked token return it and schedule a break
50					self.next_is_break = true;
51					Some(token)
52				}
53			} else {
54				// Not end of sentence, just return the token
55				Some(token)
56			}
57		} else {
58			self.is_at_end = true;
59			None
60		}
61	}
62}