unobtanium-segmenter 0.5.2

A text segmentation toolbox for search applications inspired by charabia and tantivy.
Documentation
// SPDX-FileCopyrightText: 2026 Slatian
//
// SPDX-License-Identifier: LGPL-3.0-only

use whatlang::Lang;
use whatlang::Script;

/// The main representation of data this crate works on.
///
/// A token is effectively `text` with metadata attached, this struct being the metadata carrier.
#[derive(Debug, Clone, PartialEq)]
pub struct SegmentedToken<'a> {
	/// The piece of text that this token represents.
	///
	/// This should be borrowed from the initial text that was fed to the segmenter chain.
	pub text: &'a str,

	/// If a [normalizer](crate::normalization) output was different from `text` the result will be stored in here.
	///
	/// It is recommended that you fetch normalized text using [get_text_prefer_normalized()][Self::get_text_prefer_normalized] or [get_text_prefer_normalized_owned()][Self::get_text_prefer_normalized_owned].
	pub normalized_text: NormalizedText,

	/// Which language the normalization that resulted in `normalized_text` happened with.
	///
	/// `None` means that only language independent normalizations were applied, for language dependent normalizations this should make sure they're all applied for the same language.
	///
	/// If already set to `Some` it shouldn't be changed.
	pub normalization_language: Option<Lang>,

	/// What kind of token this is.
	///
	/// Set by:
	/// * [NaiveWordSplitter][crate::segmentation::NaiveWordSplitter]
	/// * [AugmentationClassify][crate::augmentation::AugmentationClassify]
	pub kind: Option<SegmentedTokenKind>,

	/// The primary script as detected by a script or language detection augmenter.
	///
	/// Information about detected scripts is inherited across splitting.
	pub detected_script: Option<Script>,

	/// The primary language detected by a language detection augmenter.
	///
	/// Information about detected languages is inherited across splitting.
	pub detected_language: Option<Lang>,

	/// How confident the language detector was about the language that it detected.
	///
	/// This scales between `0` (not confident at all) and `1` (most confident).
	pub detected_language_confidence: f64,

	/// Whether the language detector considers its output to be reliable.
	pub is_detected_language_relible: bool,

	/// Indicates that no further splitting is necessary.
	///
	/// This should be set to true if the token was a valid word in a dictionary.
	pub is_known_word: bool,

	/// Indicates that this token marks the end of a sentence.
	///
	/// This should only be set on tokens with an empty `text` field. It is not inherited.
	pub is_end_of_sentence: bool,
}

impl<'a> SegmentedToken<'a> {
	/// Create a segmented token from scratch. (You likely won't need it)
	///
	/// If you are writing a segmenter have a look at [new_derived_from()][Self::new_derived_from].
	///
	/// For creating the initial token consider using the `From` implementations or the [StartSegmentationChain][crate::chain::StartSegmentationChain] trait.
	///
	/// All metadata starts out empty, with one exception: when `kind` is [SegmentedTokenKind::AlphaNumeric] the `detected_script` field is filled with the result of `whatlang::detect_script(text)`.
	pub fn new(text: &'a str, kind: Option<SegmentedTokenKind>) -> Self {
		// Script detection is only attempted for tokens already classified as alphanumeric.
		let detected_script = if matches!(kind, Some(SegmentedTokenKind::AlphaNumeric)) {
			whatlang::detect_script(text)
		} else {
			None
		};

		Self {
			text,
			normalized_text: NormalizedText::NotNormalized,
			normalization_language: None,
			kind,
			detected_script,
			detected_language: None,
			detected_language_confidence: 0.0,
			is_detected_language_relible: false,
			is_known_word: false,
			is_end_of_sentence: false,
		}
	}

	/// Create a token with a given text that inherits metadata from the `from` token.
	///
	/// This is the recommended constructor to use inside a segmenter after splitting.
	///
	/// Copied over from `from`: the detected script, the detected language together with its confidence and reliability, and the `is_known_word` flag.
	///
	/// Reset on the new token: `kind`, the normalization state and the `is_end_of_sentence` marker.
	pub fn new_derived_from(text: &'a str, from: &Self) -> Self {
		Self {
			text,
			// Per-token state, never carried over to the derived token.
			normalized_text: NormalizedText::NotNormalized,
			normalization_language: None,
			kind: None,
			is_end_of_sentence: false,
			// Detection results and dictionary knowledge are inherited.
			detected_script: from.detected_script,
			detected_language: from.detected_language,
			detected_language_confidence: from.detected_language_confidence,
			is_detected_language_relible: from.is_detected_language_relible,
			is_known_word: from.is_known_word,
		}
	}

	/// Create a new token that carries an `is_end_of_sentence` marker.
	///
	/// Recommended way of deriving the empty text:
	/// ```rust
	/// # use unobtanium_segmenter::SegmentedToken;
	/// # let token = SegmentedToken::new("Some example sentence to segment.", None);
	/// # let sentence = token.text; // Actual segmenter goes here
	/// let (main, tail) = sentence.split_at(sentence.len());
	/// SegmentedToken::new_derived_from(main, &token);
	/// SegmentedToken::new_end_of_sentence(tail);
	/// ```
	pub fn new_end_of_sentence(empty_text: &'a str) -> Self {
		let mut new = Self::new(empty_text, None);
		new.is_end_of_sentence = true;
		return new;
	}

	/// Helper function to convert texts that came out of a simple helper function
	/// back into segments.
	///
	/// Every entry of `texts` becomes one token created with
	/// [new_derived_from()][Self::new_derived_from], using this token as the source.
	///
	/// Using this implies that further segmenting didn't change anything
	/// for the metadata of the child segments.
	pub fn covert_to_child_segements_of_self(
		&'a self,
		texts: &'a [&'a str],
	) -> impl Iterator<Item = SegmentedToken<'a>> + 'a {
		texts
			.iter()
			.map(move |piece| Self::new_derived_from(piece, self))
	}

	/// Builder like convenience function to set the `is_known_word` flag.
	#[inline(always)]
	pub fn with_is_kown_word(mut self, is_known_word: bool) -> Self {
		self.is_known_word = is_known_word;
		return self;
	}

	/// Builder like convenience function to set the detected language.
	#[inline(always)]
	pub fn with_detected_language(
		mut self,
		lang: Option<Lang>,
		is_relible: bool,
		confidence: f64,
	) -> Self {
		self.detected_language = lang;
		self.is_detected_language_relible = is_relible;
		self.detected_language_confidence = confidence;
		self
	}

	/// Return the `normalized_text` of this token if present and `text` if not as a `str`.
	///
	/// Only [NormalizedText::Normalized] carries its own text, for the other two states the original `text` is returned.
	pub fn get_text_prefer_normalized(&self) -> &str {
		match &self.normalized_text {
			NormalizedText::Normalized(replacement) => replacement.as_str(),
			NormalizedText::NotNormalized | NormalizedText::NormalizedToSelf => self.text,
		}
	}

	/// This is the same as [get_text_prefer_normalized()][Self::get_text_prefer_normalized], but returns an owned String instead.
	pub fn get_text_prefer_normalized_owned(&self) -> String {
		String::from(self.get_text_prefer_normalized())
	}

	/// Returns the normalized text behind this token.
	///
	/// If the normalization is [NormalizedText::NormalizedToSelf] it'll return the original text.
	///
	/// It will only return `None` if no normalization was applied.
	pub fn get_normalized_text(&self) -> Option<&str> {
		match self.normalized_text {
			NormalizedText::Normalized(ref changed) => Some(changed.as_str()),
			NormalizedText::NormalizedToSelf => Some(self.text),
			NormalizedText::NotNormalized => None,
		}
	}

	/// Update this token's normalized text with an unowned `&str`.
	///
	/// If the text already matches the unnormalized text, the `normalized_text` will be set to [NormalizedText::NormalizedToSelf], otherwise an owned copy of `normalized` is stored.
	///
	/// `lang` is the language that the normalization happened for, set to `None` if the normalization was language independent.
	pub fn update_normalized_str(&mut self, normalized: &str, lang: Option<Lang>) {
		self.normalized_text = if normalized == self.text {
			NormalizedText::NormalizedToSelf
		} else {
			NormalizedText::Normalized(String::from(normalized))
		};
		self.update_normalization_language(lang);
	}

	/// Update this token's normalized text with an owned `String`.
	///
	/// If the text already matches the unnormalized text, the `normalized_text` will be set to [NormalizedText::NormalizedToSelf], otherwise `normalized` is stored as is.
	///
	/// `lang` is the language that the normalization happened for, set to `None` if the normalization was language independent.
	pub fn update_normalized_string(&mut self, normalized: String, lang: Option<Lang>) {
		self.normalized_text = if normalized == self.text {
			NormalizedText::NormalizedToSelf
		} else {
			NormalizedText::Normalized(normalized)
		};
		self.update_normalization_language(lang);
	}

	/// Update the normalization language, `None` means language independent.
	///
	/// A `Some` value replaces the stored language, `None` leaves the stored language untouched.
	pub fn update_normalization_language(&mut self, lang: Option<Lang>) {
		if let Some(language) = lang {
			self.normalization_language = Some(language);
		}
	}

	/// Returns whether the text was normalized or not.
	///
	/// A normalization that resulted in the original text still counts as normalized.
	#[inline]
	pub fn was_normalized(&self) -> bool {
		match self.normalized_text {
			NormalizedText::NotNormalized => false,
			NormalizedText::NormalizedToSelf | NormalizedText::Normalized(_) => true,
		}
	}
}

impl<'a> From<&'a String> for SegmentedToken<'a> {
	fn from(value: &'a String) -> Self {
		Self::new(value, None)
	}
}

impl<'a> From<&'a str> for SegmentedToken<'a> {
	fn from(value: &'a str) -> Self {
		Self::new(value, None)
	}
}

/// What kind of content to expect from a [SegmentedToken].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SegmentedTokenKind {
	/// The token is a collection of any kind of letters and numbers
	AlphaNumeric,

	/// The token is some kind of separator
	Separator,

	/// The token represents a symbol
	Symbol,
}

/// Represents the outcomes of text normalization that can happen on a [SegmentedToken].
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub enum NormalizedText {
	/// The token was not normalized, either it didn't pass a normalization stage, or the normalization stage didn't attempt normalization (i.e. because no algorithm is implemented for a specific language)
	#[default]
	NotNormalized,

	/// The token was normalized, but the result is the same as the original text
	NormalizedToSelf,

	/// The token was normalized into something that is not the original text.
	Normalized(String),
}

impl From<NormalizedText> for Option<String> {
	/// Extract the owned normalized text, if there is any.
	///
	/// Only [NormalizedText::Normalized] yields `Some`, the other two states carry no text of their own and yield `None`.
	fn from(value: NormalizedText) -> Self {
		match value {
			NormalizedText::Normalized(text) => Some(text),
			NormalizedText::NotNormalized | NormalizedText::NormalizedToSelf => None,
		}
	}
}