unobtanium_segmenter/
segmented_token.rs

1// SPDX-FileCopyrightText: 2026 Slatian
2//
3// SPDX-License-Identifier: LGPL-3.0-only
4
5use whatlang::Lang;
6use whatlang::Script;
7
8/// The main representation of data this crate works on.
9///
10/// A token is effectively `text` with metadata attached, this struct being the metadata carrier.
11#[derive(Debug, Clone, PartialEq)]
12pub struct SegmentedToken<'a> {
13	/// The piece of text that this token represents.
14	///
15	/// This should be borrowed from the initial text that was fed to the segmenter chain.
16	pub text: &'a str,
17
18	/// If a [normalizer](crate::normalization) output was different from `text` the result will be stored in here.
19	///
20	/// It is recommended that you fetch normalized text using [get_text_prefer_normalized()][Self::get_text_prefer_normalized] or [get_text_prefer_normalized_owned()][Self::get_text_prefer_normalized_owned].
21	pub normalized_text: NormalizedText,
22
23	/// Which language the normalization that resulted in `normalized_text` happend with.
24	///
25	/// `None` means that only language independent normalizations were applied, for language dependent normalizations this should make sure they're all applied for the same language.
26	///
27	/// If already set to `Some` it shouldn't be changed.
28	pub normalization_language: Option<Lang>,
29
30	/// What kind of token this is.
31	///
32	/// Set by:
33	/// * [NaiveWordSplitter][crate::segmentation::NaiveWordSplitter]
34	/// * [AugmentationClassify][crate::augmentation::AugmentationClassify]
35	pub kind: Option<SegmentedTokenKind>,
36
37	/// The primary script as detected by a script or language detection augmenter.
38	///
39	/// Information about detected scripts is inherited across splitting.
40	pub detected_script: Option<Script>,
41
42	/// The primary language detected by a language detection augmenter.
43	///
44	/// Information about detected languages in inherited across splitting.
45	pub detected_language: Option<Lang>,
46
47	/// How confident the language detector was about the language that it detectd.
48	///
49	/// This scales inbetween `0` (not confident at all) and `1` (most confident).
50	pub detected_language_confidence: f64,
51
52	/// Wheter the language detector considers its output to be reliable.
53	pub is_detected_language_relible: bool,
54
55	/// Indicates that no further splitting is neccessary.
56	///
57	/// This should be set to true if the token was a valid word in a dictionary.
58	pub is_known_word: bool,
59
60	/// Indicates that this token marks the end of a sentence.
61	///
62	/// This should only be set on tokens with an empty `text` field. It is not inherited.
63	pub is_end_of_sentence: bool,
64}
65
66impl<'a> SegmentedToken<'a> {
67	/// Create a segmented token from scratch. (You likely won't need it)
68	///
69	/// If you are wwriting a segmenter have a look at [new_derived_from()][Self::new_derived_from].
70	///
71	/// For creating the initial token consider usng the `From` implementations or the [StartSegmentationChain][crate::chain::StartSegmentationChain] trait.
72	pub fn new(text: &'a str, kind: Option<SegmentedTokenKind>) -> Self {
73		let mut out = Self {
74			text,
75			kind,
76			normalized_text: NormalizedText::NotNormalized,
77			normalization_language: None,
78			is_known_word: false,
79			detected_script: None,
80			detected_language: None,
81			detected_language_confidence: 0.0,
82			is_detected_language_relible: false,
83			is_end_of_sentence: false,
84		};
85		match kind {
86			Some(SegmentedTokenKind::AlphaNumeric) => {
87				out.detected_script = whatlang::detect_script(text);
88			}
89			_ => { /* Do nothing */ }
90		}
91		return out;
92	}
93
94	/// Create a token with a given text that inerits metadata from the `from` token.
95	///
96	/// This is the recommended constructor to use inside a segmenter after splitting.
97	pub fn new_derived_from(text: &'a str, from: &Self) -> Self {
98		Self {
99			text,
100			kind: None,
101			is_known_word: from.is_known_word,
102			detected_script: from.detected_script,
103			detected_language: from.detected_language,
104			detected_language_confidence: from.detected_language_confidence,
105			is_detected_language_relible: from.is_detected_language_relible,
106			normalized_text: NormalizedText::NotNormalized,
107			normalization_language: None,
108			is_end_of_sentence: false,
109		}
110	}
111
112	/// Create a new token that carries an `is_end_of_sentence` marker.
113	///
114	/// Recommended way of deriving the empty text:
115	/// ```rust
116	/// # use unobtanium_segmenter::SegmentedToken;
117	/// # let token = SegmentedToken::new("Some example sentence to segment.", None);
118	/// # let sentence = token.text; // Actual segmenter goes here
119	/// let (main, tail) = sentence.split_at(sentence.len());
120	/// SegmentedToken::new_derived_from(main, &token);
121	/// SegmentedToken::new_end_of_sentence(tail);
122	/// ```
123	pub fn new_end_of_sentence(empty_text: &'a str) -> Self {
124		let mut new = Self::new(empty_text, None);
125		new.is_end_of_sentence = true;
126		return new;
127	}
128
129	/// Helper function to convert texts that came ot of a simple helper function
130	/// back into segments.
131	///
132	/// Using this implies that further segmenting didn't change anything
133	/// for the metadta of the child segments.
134	pub fn covert_to_child_segements_of_self(
135		&'a self,
136		texts: &'a [&'a str],
137	) -> impl Iterator<Item = SegmentedToken<'a>> + 'a {
138		texts.iter().map(|text| Self::new_derived_from(text, self))
139	}
140
141	/// Builder like convenience function to set the `is_known_word` flag.
142	#[inline(always)]
143	pub fn with_is_kown_word(mut self, is_known_word: bool) -> Self {
144		self.is_known_word = is_known_word;
145		return self;
146	}
147
148	/// Return the `normalized_text` of this token if present and `text` if not as a `str`.
149	pub fn get_text_prefer_normalized(&self) -> &str {
150		if let NormalizedText::Normalized(normalized_text) = &self.normalized_text {
151			return normalized_text.as_str();
152		} else {
153			return self.text;
154		}
155	}
156
157	/// This is the same as [get_text_prefer_normalized()][Self::get_text_prefer_normalized], but returns an owned String instead.
158	pub fn get_text_prefer_normalized_owned(&self) -> String {
159		self.get_text_prefer_normalized().to_string()
160	}
161
162	/// Returns the normalized text behind this token.
163	///
164	/// If the normalization is [NormalizedText::NormalizedToSelf] it'll return the original text.
165	///
166	/// It will only return `None` if not normalization was applied.
167	pub fn get_normalized_text(&self) -> Option<&str> {
168		match &self.normalized_text {
169			NormalizedText::NotNormalized => None,
170			NormalizedText::NormalizedToSelf => Some(self.text),
171			NormalizedText::Normalized(text) => Some(text.as_str()),
172		}
173	}
174
175	/// Update this tokens normalized text with an unowned `&str`.
176	///
177	/// If the text already matches the unnormalized text, the `normalized_text` will be set to [NormalizedText::NormalizedToSelf].
178	///
179	/// `lang` is the language that the normalization happend for, set to `None` if the normalization was language independent.
180	pub fn update_normalized_str(&mut self, normalized: &str, lang: Option<Lang>) {
181		if self.text == normalized {
182			self.normalized_text = NormalizedText::NormalizedToSelf;
183		} else {
184			self.normalized_text = NormalizedText::Normalized(normalized.to_owned())
185		}
186		self.update_normalization_language(lang);
187	}
188
189	/// Update this tokens normalized text with an owned `String`.
190	///
191	/// If the text already matches the unnormalized text, the `normalized_text` will be set to [NormalizedText::NormalizedToSelf].
192	///
193	/// `lang` is the language that the normalization happend for, set to `None` if the normalization was language independent.
194	pub fn update_normalized_string(&mut self, normalized: String, lang: Option<Lang>) {
195		if self.text == normalized {
196			self.normalized_text = NormalizedText::NormalizedToSelf;
197		} else {
198			self.normalized_text = NormalizedText::Normalized(normalized);
199		}
200		self.update_normalization_language(lang);
201	}
202
203	/// Update the normalization language, `None` means languge independent
204	pub fn update_normalization_language(&mut self, lang: Option<Lang>) {
205		if lang.is_some() {
206			self.normalization_language = lang;
207		}
208	}
209
210	/// Returns wheather the text was normalized or not.
211	#[inline]
212	pub fn was_normalized(&self) -> bool {
213		!matches!(self.normalized_text, NormalizedText::NotNormalized)
214	}
215}
216
217impl<'a> From<&'a String> for SegmentedToken<'a> {
218	fn from(value: &'a String) -> Self {
219		Self::new(value, None)
220	}
221}
222
223impl<'a> From<&'a str> for SegmentedToken<'a> {
224	fn from(value: &'a str) -> Self {
225		Self::new(value, None)
226	}
227}
228
/// The category of content a [SegmentedToken] can hold.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SegmentedTokenKind {
	/// A run of any kind of letters and numbers.
	AlphaNumeric,

	/// Some kind of separator.
	Separator,

	/// A symbol.
	Symbol,
}
241
/// The possible outcomes of text normalization on a [SegmentedToken].
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub enum NormalizedText {
	/// The token was not normalized: either it didn't pass through a normalization stage, or the normalization stage didn't attempt normalization (i.e. because no algorithm is implemented for a specific language).
	#[default]
	NotNormalized,

	/// The token was normalized, but the result equals the original text.
	NormalizedToSelf,

	/// The token was normalized into something different from the original text.
	Normalized(String),
}

/// Extracts the normalized string, if one is stored.
///
/// Only [NormalizedText::Normalized] yields `Some`; both other variants convert to `None`.
impl From<NormalizedText> for Option<String> {
	fn from(value: NormalizedText) -> Self {
		match value {
			NormalizedText::Normalized(text) => Some(text),
			NormalizedText::NotNormalized | NormalizedText::NormalizedToSelf => None,
		}
	}
}