Skip to main content

unobtanium_segmenter/
segmented_token.rs

1// SPDX-FileCopyrightText: 2026 Slatian
2//
3// SPDX-License-Identifier: LGPL-3.0-only
4
5use whatlang::Lang;
6use whatlang::Script;
7
8/// The main representation of data this crate works on.
9///
10/// A token is effectively `text` with metadata attached, this struct being the metadata carrier.
11#[derive(Debug, Clone, PartialEq)]
12pub struct SegmentedToken<'a> {
13	/// The piece of text that this token represents.
14	///
15	/// This should be borrowed from the initial text that was fed to the segmenter chain.
16	pub text: &'a str,
17
18	/// If a [normalizer](crate::normalization) output was different from `text` the result will be stored in here.
19	///
20	/// It is recommended that you fetch normalized text using [get_text_prefer_normalized()][Self::get_text_prefer_normalized] or [get_text_prefer_normalized_owned()][Self::get_text_prefer_normalized_owned].
21	pub normalized_text: NormalizedText,
22
23	/// Which language the normalization that resulted in `normalized_text` happend with.
24	///
25	/// `None` means that only language independent normalizations were applied, for language dependent normalizations this should make sure they're all applied for the same language.
26	///
27	/// If already set to `Some` it shouldn't be changed.
28	pub normalization_language: Option<Lang>,
29
30	/// What kind of token this is.
31	///
32	/// Set by:
33	/// * [NaiveWordSplitter][crate::segmentation::NaiveWordSplitter]
34	/// * [AugmentationClassify][crate::augmentation::AugmentationClassify]
35	pub kind: Option<SegmentedTokenKind>,
36
37	/// The primary script as detected by a script or language detection augmenter.
38	///
39	/// Information about detected scripts is inherited across splitting.
40	pub detected_script: Option<Script>,
41
42	/// The primary language detected by a language detection augmenter.
43	///
44	/// Information about detected languages in inherited across splitting.
45	pub detected_language: Option<Lang>,
46
47	/// How confident the language detector was about the language that it detectd.
48	///
49	/// This scales inbetween `0` (not confident at all) and `1` (most confident).
50	pub detected_language_confidence: f64,
51
52	/// Wheter the language detector considers its output to be reliable.
53	pub is_detected_language_relible: bool,
54
55	/// Indicates that no further splitting is neccessary.
56	///
57	/// This should be set to true if the token was a valid word in a dictionary.
58	pub is_known_word: bool,
59
60	/// Indicates that this token marks the end of a sentence.
61	///
62	/// This should only be set on tokens with an empty `text` field. It is not inherited.
63	pub is_end_of_sentence: bool,
64}
65
66impl<'a> SegmentedToken<'a> {
67	/// Create a segmented token from scratch. (You likely won't need it)
68	///
69	/// If you are wwriting a segmenter have a look at [new_derived_from()][Self::new_derived_from].
70	///
71	/// For creating the initial token consider usng the `From` implementations or the [StartSegmentationChain][crate::chain::StartSegmentationChain] trait.
72	pub fn new(text: &'a str, kind: Option<SegmentedTokenKind>) -> Self {
73		let mut out = Self {
74			text,
75			kind,
76			normalized_text: NormalizedText::NotNormalized,
77			normalization_language: None,
78			is_known_word: false,
79			detected_script: None,
80			detected_language: None,
81			detected_language_confidence: 0.0,
82			is_detected_language_relible: false,
83			is_end_of_sentence: false,
84		};
85		match kind {
86			Some(SegmentedTokenKind::AlphaNumeric) => {
87				out.detected_script = whatlang::detect_script(text);
88			}
89			_ => { /* Do nothing */ }
90		}
91		return out;
92	}
93
94	/// Create a token with a given text that inerits metadata from the `from` token.
95	///
96	/// This is the recommended constructor to use inside a segmenter after splitting.
97	pub fn new_derived_from(text: &'a str, from: &Self) -> Self {
98		Self {
99			text,
100			kind: None,
101			is_known_word: from.is_known_word,
102			detected_script: from.detected_script,
103			detected_language: from.detected_language,
104			detected_language_confidence: from.detected_language_confidence,
105			is_detected_language_relible: from.is_detected_language_relible,
106			normalized_text: NormalizedText::NotNormalized,
107			normalization_language: None,
108			is_end_of_sentence: false,
109		}
110	}
111
112	/// Create a new token that carries an `is_end_of_sentence` marker.
113	///
114	/// Recommended way of deriving the empty text:
115	/// ```rust
116	/// # use unobtanium_segmenter::SegmentedToken;
117	/// # let token = SegmentedToken::new("Some example sentence to segment.", None);
118	/// # let sentence = token.text; // Actual segmenter goes here
119	/// let (main, tail) = sentence.split_at(sentence.len());
120	/// SegmentedToken::new_derived_from(main, &token);
121	/// SegmentedToken::new_end_of_sentence(tail);
122	/// ```
123	pub fn new_end_of_sentence(empty_text: &'a str) -> Self {
124		let mut new = Self::new(empty_text, None);
125		new.is_end_of_sentence = true;
126		return new;
127	}
128
129	/// Helper function to convert texts that came ot of a simple helper function
130	/// back into segments.
131	///
132	/// Using this implies that further segmenting didn't change anything
133	/// for the metadta of the child segments.
134	pub fn covert_to_child_segements_of_self(
135		&'a self,
136		texts: &'a [&'a str],
137	) -> impl Iterator<Item = SegmentedToken<'a>> + 'a {
138		texts.iter().map(|text| Self::new_derived_from(text, self))
139	}
140
141	/// Builder like convenience function to set the `is_known_word` flag.
142	#[inline(always)]
143	pub fn with_is_kown_word(mut self, is_known_word: bool) -> Self {
144		self.is_known_word = is_known_word;
145		return self;
146	}
147
148	/// Builder like convenience function to set the detected language.
149	#[inline(always)]
150	pub fn with_detected_language(
151		mut self,
152		lang: Option<Lang>,
153		is_relible: bool,
154		confidence: f64,
155	) -> Self {
156		self.detected_language = lang;
157		self.is_detected_language_relible = is_relible;
158		self.detected_language_confidence = confidence;
159		self
160	}
161
162	/// Return the `normalized_text` of this token if present and `text` if not as a `str`.
163	pub fn get_text_prefer_normalized(&self) -> &str {
164		if let NormalizedText::Normalized(normalized_text) = &self.normalized_text {
165			return normalized_text.as_str();
166		} else {
167			return self.text;
168		}
169	}
170
171	/// This is the same as [get_text_prefer_normalized()][Self::get_text_prefer_normalized], but returns an owned String instead.
172	pub fn get_text_prefer_normalized_owned(&self) -> String {
173		self.get_text_prefer_normalized().to_string()
174	}
175
176	/// Returns the normalized text behind this token.
177	///
178	/// If the normalization is [NormalizedText::NormalizedToSelf] it'll return the original text.
179	///
180	/// It will only return `None` if not normalization was applied.
181	pub fn get_normalized_text(&self) -> Option<&str> {
182		match &self.normalized_text {
183			NormalizedText::NotNormalized => None,
184			NormalizedText::NormalizedToSelf => Some(self.text),
185			NormalizedText::Normalized(text) => Some(text.as_str()),
186		}
187	}
188
189	/// Update this tokens normalized text with an unowned `&str`.
190	///
191	/// If the text already matches the unnormalized text, the `normalized_text` will be set to [NormalizedText::NormalizedToSelf].
192	///
193	/// `lang` is the language that the normalization happend for, set to `None` if the normalization was language independent.
194	pub fn update_normalized_str(&mut self, normalized: &str, lang: Option<Lang>) {
195		if self.text == normalized {
196			self.normalized_text = NormalizedText::NormalizedToSelf;
197		} else {
198			self.normalized_text = NormalizedText::Normalized(normalized.to_owned())
199		}
200		self.update_normalization_language(lang);
201	}
202
203	/// Update this tokens normalized text with an owned `String`.
204	///
205	/// If the text already matches the unnormalized text, the `normalized_text` will be set to [NormalizedText::NormalizedToSelf].
206	///
207	/// `lang` is the language that the normalization happend for, set to `None` if the normalization was language independent.
208	pub fn update_normalized_string(&mut self, normalized: String, lang: Option<Lang>) {
209		if self.text == normalized {
210			self.normalized_text = NormalizedText::NormalizedToSelf;
211		} else {
212			self.normalized_text = NormalizedText::Normalized(normalized);
213		}
214		self.update_normalization_language(lang);
215	}
216
217	/// Update the normalization language, `None` means languge independent
218	pub fn update_normalization_language(&mut self, lang: Option<Lang>) {
219		if lang.is_some() {
220			self.normalization_language = lang;
221		}
222	}
223
224	/// Returns wheather the text was normalized or not.
225	#[inline]
226	pub fn was_normalized(&self) -> bool {
227		!matches!(self.normalized_text, NormalizedText::NotNormalized)
228	}
229}
230
231impl<'a> From<&'a String> for SegmentedToken<'a> {
232	fn from(value: &'a String) -> Self {
233		Self::new(value, None)
234	}
235}
236
237impl<'a> From<&'a str> for SegmentedToken<'a> {
238	fn from(value: &'a str) -> Self {
239		Self::new(value, None)
240	}
241}
242
/// Classifies the content of a [SegmentedToken].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SegmentedTokenKind {
	/// The token consists of any kind of letters and numbers.
	AlphaNumeric,

	/// The token is some kind of separator.
	Separator,

	/// The token stands for a symbol.
	Symbol,
}
255
/// The possible outcomes of text normalization on a [SegmentedToken].
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub enum NormalizedText {
	/// The token was not normalized: either it didn't pass a normalization stage, or the normalization stage didn't attempt normalization (i.e. because no algorithm is implemented for a specific language).
	#[default]
	NotNormalized,

	/// The token was normalized, but the outcome equals the original text.
	NormalizedToSelf,

	/// The token was normalized into something that differs from the original text.
	Normalized(String),
}
269
270impl From<NormalizedText> for Option<String> {
271	fn from(value: NormalizedText) -> Self {
272		if let NormalizedText::Normalized(text) = value {
273			Some(text)
274		} else {
275			None
276		}
277	}
278}