unobtanium_segmenter/segmented_token.rs
1// SPDX-FileCopyrightText: 2026 Slatian
2//
3// SPDX-License-Identifier: LGPL-3.0-only
4
5use whatlang::Lang;
6use whatlang::Script;
7
8/// The main representation of data this crate works on.
9///
10/// A token is effectively `text` with metadata attached, this struct being the metadata carrier.
11#[derive(Debug, Clone, PartialEq)]
12pub struct SegmentedToken<'a> {
13 /// The piece of text that this token represents.
14 ///
15 /// This should be borrowed from the initial text that was fed to the segmenter chain.
16 pub text: &'a str,
17
18 /// If a [normalizer](crate::normalization) output was different from `text` the result will be stored in here.
19 ///
20 /// It is recommended that you fetch normalized text using [get_text_prefer_normalized()][Self::get_text_prefer_normalized] or [get_text_prefer_normalized_owned()][Self::get_text_prefer_normalized_owned].
21 pub normalized_text: NormalizedText,
22
23 /// Which language the normalization that resulted in `normalized_text` happend with.
24 ///
25 /// `None` means that only language independent normalizations were applied, for language dependent normalizations this should make sure they're all applied for the same language.
26 ///
27 /// If already set to `Some` it shouldn't be changed.
28 pub normalization_language: Option<Lang>,
29
30 /// What kind of token this is.
31 ///
32 /// Set by:
33 /// * [NaiveWordSplitter][crate::segmentation::NaiveWordSplitter]
34 /// * [AugmentationClassify][crate::augmentation::AugmentationClassify]
35 pub kind: Option<SegmentedTokenKind>,
36
37 /// The primary script as detected by a script or language detection augmenter.
38 ///
39 /// Information about detected scripts is inherited across splitting.
40 pub detected_script: Option<Script>,
41
42 /// The primary language detected by a language detection augmenter.
43 ///
44 /// Information about detected languages in inherited across splitting.
45 pub detected_language: Option<Lang>,
46
47 /// How confident the language detector was about the language that it detectd.
48 ///
49 /// This scales inbetween `0` (not confident at all) and `1` (most confident).
50 pub detected_language_confidence: f64,
51
52 /// Wheter the language detector considers its output to be reliable.
53 pub is_detected_language_relible: bool,
54
55 /// Indicates that no further splitting is neccessary.
56 ///
57 /// This should be set to true if the token was a valid word in a dictionary.
58 pub is_known_word: bool,
59
60 /// Indicates that this token marks the end of a sentence.
61 ///
62 /// This should only be set on tokens with an empty `text` field. It is not inherited.
63 pub is_end_of_sentence: bool,
64}
65
66impl<'a> SegmentedToken<'a> {
67 /// Create a segmented token from scratch. (You likely won't need it)
68 ///
69 /// If you are wwriting a segmenter have a look at [new_derived_from()][Self::new_derived_from].
70 ///
71 /// For creating the initial token consider usng the `From` implementations or the [StartSegmentationChain][crate::chain::StartSegmentationChain] trait.
72 pub fn new(text: &'a str, kind: Option<SegmentedTokenKind>) -> Self {
73 let mut out = Self {
74 text,
75 kind,
76 normalized_text: NormalizedText::NotNormalized,
77 normalization_language: None,
78 is_known_word: false,
79 detected_script: None,
80 detected_language: None,
81 detected_language_confidence: 0.0,
82 is_detected_language_relible: false,
83 is_end_of_sentence: false,
84 };
85 match kind {
86 Some(SegmentedTokenKind::AlphaNumeric) => {
87 out.detected_script = whatlang::detect_script(text);
88 }
89 _ => { /* Do nothing */ }
90 }
91 return out;
92 }
93
94 /// Create a token with a given text that inerits metadata from the `from` token.
95 ///
96 /// This is the recommended constructor to use inside a segmenter after splitting.
97 pub fn new_derived_from(text: &'a str, from: &Self) -> Self {
98 Self {
99 text,
100 kind: None,
101 is_known_word: from.is_known_word,
102 detected_script: from.detected_script,
103 detected_language: from.detected_language,
104 detected_language_confidence: from.detected_language_confidence,
105 is_detected_language_relible: from.is_detected_language_relible,
106 normalized_text: NormalizedText::NotNormalized,
107 normalization_language: None,
108 is_end_of_sentence: false,
109 }
110 }
111
112 /// Create a new token that carries an `is_end_of_sentence` marker.
113 ///
114 /// Recommended way of deriving the empty text:
115 /// ```rust
116 /// # use unobtanium_segmenter::SegmentedToken;
117 /// # let token = SegmentedToken::new("Some example sentence to segment.", None);
118 /// # let sentence = token.text; // Actual segmenter goes here
119 /// let (main, tail) = sentence.split_at(sentence.len());
120 /// SegmentedToken::new_derived_from(main, &token);
121 /// SegmentedToken::new_end_of_sentence(tail);
122 /// ```
123 pub fn new_end_of_sentence(empty_text: &'a str) -> Self {
124 let mut new = Self::new(empty_text, None);
125 new.is_end_of_sentence = true;
126 return new;
127 }
128
129 /// Helper function to convert texts that came ot of a simple helper function
130 /// back into segments.
131 ///
132 /// Using this implies that further segmenting didn't change anything
133 /// for the metadta of the child segments.
134 pub fn covert_to_child_segements_of_self(
135 &'a self,
136 texts: &'a [&'a str],
137 ) -> impl Iterator<Item = SegmentedToken<'a>> + 'a {
138 texts.iter().map(|text| Self::new_derived_from(text, self))
139 }
140
141 /// Builder like convenience function to set the `is_known_word` flag.
142 #[inline(always)]
143 pub fn with_is_kown_word(mut self, is_known_word: bool) -> Self {
144 self.is_known_word = is_known_word;
145 return self;
146 }
147
148 /// Builder like convenience function to set the detected language.
149 #[inline(always)]
150 pub fn with_detected_language(
151 mut self,
152 lang: Option<Lang>,
153 is_relible: bool,
154 confidence: f64,
155 ) -> Self {
156 self.detected_language = lang;
157 self.is_detected_language_relible = is_relible;
158 self.detected_language_confidence = confidence;
159 self
160 }
161
162 /// Return the `normalized_text` of this token if present and `text` if not as a `str`.
163 pub fn get_text_prefer_normalized(&self) -> &str {
164 if let NormalizedText::Normalized(normalized_text) = &self.normalized_text {
165 return normalized_text.as_str();
166 } else {
167 return self.text;
168 }
169 }
170
171 /// This is the same as [get_text_prefer_normalized()][Self::get_text_prefer_normalized], but returns an owned String instead.
172 pub fn get_text_prefer_normalized_owned(&self) -> String {
173 self.get_text_prefer_normalized().to_string()
174 }
175
176 /// Returns the normalized text behind this token.
177 ///
178 /// If the normalization is [NormalizedText::NormalizedToSelf] it'll return the original text.
179 ///
180 /// It will only return `None` if not normalization was applied.
181 pub fn get_normalized_text(&self) -> Option<&str> {
182 match &self.normalized_text {
183 NormalizedText::NotNormalized => None,
184 NormalizedText::NormalizedToSelf => Some(self.text),
185 NormalizedText::Normalized(text) => Some(text.as_str()),
186 }
187 }
188
189 /// Update this tokens normalized text with an unowned `&str`.
190 ///
191 /// If the text already matches the unnormalized text, the `normalized_text` will be set to [NormalizedText::NormalizedToSelf].
192 ///
193 /// `lang` is the language that the normalization happend for, set to `None` if the normalization was language independent.
194 pub fn update_normalized_str(&mut self, normalized: &str, lang: Option<Lang>) {
195 if self.text == normalized {
196 self.normalized_text = NormalizedText::NormalizedToSelf;
197 } else {
198 self.normalized_text = NormalizedText::Normalized(normalized.to_owned())
199 }
200 self.update_normalization_language(lang);
201 }
202
203 /// Update this tokens normalized text with an owned `String`.
204 ///
205 /// If the text already matches the unnormalized text, the `normalized_text` will be set to [NormalizedText::NormalizedToSelf].
206 ///
207 /// `lang` is the language that the normalization happend for, set to `None` if the normalization was language independent.
208 pub fn update_normalized_string(&mut self, normalized: String, lang: Option<Lang>) {
209 if self.text == normalized {
210 self.normalized_text = NormalizedText::NormalizedToSelf;
211 } else {
212 self.normalized_text = NormalizedText::Normalized(normalized);
213 }
214 self.update_normalization_language(lang);
215 }
216
217 /// Update the normalization language, `None` means languge independent
218 pub fn update_normalization_language(&mut self, lang: Option<Lang>) {
219 if lang.is_some() {
220 self.normalization_language = lang;
221 }
222 }
223
224 /// Returns wheather the text was normalized or not.
225 #[inline]
226 pub fn was_normalized(&self) -> bool {
227 !matches!(self.normalized_text, NormalizedText::NotNormalized)
228 }
229}
230
231impl<'a> From<&'a String> for SegmentedToken<'a> {
232 fn from(value: &'a String) -> Self {
233 Self::new(value, None)
234 }
235}
236
237impl<'a> From<&'a str> for SegmentedToken<'a> {
238 fn from(value: &'a str) -> Self {
239 Self::new(value, None)
240 }
241}
242
/// The kind of content a [SegmentedToken] can be expected to hold.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum SegmentedTokenKind {
    /// A run of any kind of letters and numbers.
    AlphaNumeric,

    /// Some kind of separator.
    Separator,

    /// A symbol.
    Symbol,
}
255
/// The possible outcomes of text normalization on a [SegmentedToken].
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub enum NormalizedText {
    /// No normalization happened: either the token didn't pass a normalization stage, or the stage didn't attempt normalization (i.e. because no algorithm is implemented for a specific language).
    #[default]
    NotNormalized,

    /// Normalization happened, but the result equals the original text.
    NormalizedToSelf,

    /// Normalization happened and produced text that differs from the original.
    Normalized(String),
}
269
270impl From<NormalizedText> for Option<String> {
271 fn from(value: NormalizedText) -> Self {
272 if let NormalizedText::Normalized(text) = value {
273 Some(text)
274 } else {
275 None
276 }
277 }
278}