unobtanium-segmenter 0.5.2

A text segmentation toolbox for search applications inspired by charabia and tantivy.
Documentation
// SPDX-FileCopyrightText: 2026 Slatian
//
// SPDX-License-Identifier: LGPL-3.0-only

use lingua::DetectionResult;
use lingua::Language;
use lingua::LanguageDetector;

use std::vec::IntoIter;

use crate::language_detection::detect_chinese_or_japanese;

/// A helper struct to iterate over the results of a
/// [`lingua::LanguageDetector::detect_multiple_languages_of`] call.
///
/// The iterator yields `(start_offset, end_offset, language)` spans that
/// cover the text from offset `0` up to `text.len()`: stretches of text
/// that lie between (or after) the detections are yielded with `None`
/// as their language.
pub struct DetectionIterator<'a> {
	/// End offset of the span that was yielded last (`0` before the first one).
	last_offset: usize,
	/// Length of `text` in bytes, cached from `text.len()`.
	total_length: usize,
	/// The detection results that have not been consumed yet.
	inner: IntoIter<DetectionResult>,
	/// A detection that was already pulled from `inner` but is held back
	/// because the gap in front of it had to be yielded first.
	peek_buffer: Option<DetectionResult>,
	/// The text the detections refer to, sliced with the detection offsets
	/// to take a second look at Chinese/Japanese results.
	text: &'a str,
}

impl<'a> DetectionIterator<'a> {
	/// Creates an iterator from an already computed list of `detections`
	/// that were made on `text`.
	///
	/// The detections are consumed in the order they are given; this
	/// constructor neither sorts nor validates them.
	pub fn new(detections: Vec<DetectionResult>, text: &'a str) -> Self {
		let total_length = text.len();
		let inner = detections.into_iter();
		Self {
			text,
			total_length,
			inner,
			peek_buffer: None,
			last_offset: 0,
		}
	}

	/// Runs `detector.detect_multiple_languages_of(text)` and wraps the
	/// outcome in a new iterator over the same `text`.
	pub fn detect(detector: &LanguageDetector, text: &'a str) -> Self {
		Self::new(detector.detect_multiple_languages_of(text), text)
	}
}

impl<'a> Iterator for DetectionIterator<'a> {
	/// (start_offset, end_offset, language)
	///
	/// The language is `None` for spans that are not covered by a detection.
	type Item = (usize, usize, Option<Language>);

	/// Yields the next span, continuing at the end of the previous one.
	///
	/// In order of precedence this is:
	/// * the gap in front of the next detection (language `None`),
	/// * the next detection itself,
	/// * the remaining text after the last detection (language `None`),
	/// * `None` once the end of the text was reached.
	fn next(&mut self) -> Option<Self::Item> {
		let span_start = self.last_offset;

		// A detection that was held back by the previous call goes first.
		let detection = match self.peek_buffer.take().or_else(|| self.inner.next()) {
			Some(detection) => detection,
			None => {
				// Out of detections: hand out the tail of the text once, if any.
				if span_start >= self.total_length {
					return None;
				}
				self.last_offset = self.total_length;
				return Some((span_start, self.total_length, None));
			}
		};

		let detection_start = detection.start_index();
		if detection_start != span_start {
			// The detection does not begin where the previous span ended.
			// Yield the stretch in between and keep the detection for the next call.
			self.last_offset = detection_start;
			self.peek_buffer = Some(detection);
			return Some((span_start, detection_start, None));
		}

		let span_end = detection.end_index();
		self.last_offset = span_end;

		let detected = detection.language();
		// Chinese and Japanese results get a second opinion from the crate's own
		// detector; its answer replaces the original one only when it has one.
		let language = if matches!(detected, Language::Chinese | Language::Japanese) {
			match detect_chinese_or_japanese(&self.text[span_start..span_end]) {
				Some((refined, _)) => refined,
				None => detected,
			}
		} else {
			detected
		};

		Some((span_start, span_end, Some(language)))
	}
}