unobtanium_segmenter/segmented_token.rs
1// SPDX-FileCopyrightText: 2026 Slatian
2//
3// SPDX-License-Identifier: LGPL-3.0-only
4
5use whatlang::Lang;
6use whatlang::Script;
7
/// The main representation of data this crate works on.
///
/// A token is effectively a piece of `text` with metadata attached; this struct is the carrier of that metadata.
#[derive(Debug, Clone, PartialEq)]
pub struct SegmentedToken<'a> {
    /// The piece of text that this token represents.
    ///
    /// This should be borrowed from the initial text that was fed to the segmenter chain.
    pub text: &'a str,

    /// If the output of a [normalizer](crate::normalization) was different from `text`, the result is stored in here.
    ///
    /// See [NormalizedText] for the possible states.
    ///
    /// It is recommended that you fetch normalized text using [get_text_prefer_normalized()][Self::get_text_prefer_normalized] or [get_text_prefer_normalized_owned()][Self::get_text_prefer_normalized_owned].
    pub normalized_text: NormalizedText,

    /// The language for which the normalization that resulted in `normalized_text` happened.
    ///
    /// `None` means that only language independent normalizations were applied. For language dependent normalizations this should make sure that they are all applied for the same language.
    ///
    /// If already set to `Some` it shouldn't be changed.
    pub normalization_language: Option<Lang>,

    /// What kind of token this is.
    ///
    /// Set by:
    /// * [NaiveWordSplitter][crate::segmentation::NaiveWordSplitter]
    /// * [AugmentationClassify][crate::augmentation::AugmentationClassify]
    pub kind: Option<SegmentedTokenKind>,

    /// The primary script as detected by a script or language detection augmenter.
    ///
    /// [new()][Self::new] also fills this in for tokens created with the [SegmentedTokenKind::AlphaNumeric] kind.
    ///
    /// Information about detected scripts is inherited across splitting.
    pub detected_script: Option<Script>,

    /// The primary language detected by a language detection augmenter.
    ///
    /// Information about detected languages is inherited across splitting.
    pub detected_language: Option<Lang>,

    /// How confident the language detector was about the language that it detected.
    ///
    /// This scales between `0` (not confident at all) and `1` (most confident).
    /// Freshly created tokens start out with `0.0`.
    pub detected_language_confidence: f64,

    /// Whether the language detector considers its output to be reliable.
    pub is_detected_language_relible: bool,

    /// Indicates that no further splitting is necessary.
    ///
    /// This should be set to true if the token was a valid word in a dictionary.
    pub is_known_word: bool,

    /// Indicates that this token marks the end of a sentence.
    ///
    /// This should only be set on tokens with an empty `text` field. It is not inherited.
    pub is_end_of_sentence: bool,
}
65
66impl<'a> SegmentedToken<'a> {
67 /// Create a segmented token from scratch. (You likely won't need it)
68 ///
69 /// If you are wwriting a segmenter have a look at [new_derived_from()][Self::new_derived_from].
70 ///
71 /// For creating the initial token consider usng the `From` implementations or the [StartSegmentationChain][crate::chain::StartSegmentationChain] trait.
72 pub fn new(text: &'a str, kind: Option<SegmentedTokenKind>) -> Self {
73 let mut out = Self {
74 text,
75 kind,
76 normalized_text: NormalizedText::NotNormalized,
77 normalization_language: None,
78 is_known_word: false,
79 detected_script: None,
80 detected_language: None,
81 detected_language_confidence: 0.0,
82 is_detected_language_relible: false,
83 is_end_of_sentence: false,
84 };
85 match kind {
86 Some(SegmentedTokenKind::AlphaNumeric) => {
87 out.detected_script = whatlang::detect_script(text);
88 }
89 _ => { /* Do nothing */ }
90 }
91 return out;
92 }
93
94 /// Create a token with a given text that inerits metadata from the `from` token.
95 ///
96 /// This is the recommended constructor to use inside a segmenter after splitting.
97 pub fn new_derived_from(text: &'a str, from: &Self) -> Self {
98 Self {
99 text,
100 kind: None,
101 is_known_word: from.is_known_word,
102 detected_script: from.detected_script,
103 detected_language: from.detected_language,
104 detected_language_confidence: from.detected_language_confidence,
105 is_detected_language_relible: from.is_detected_language_relible,
106 normalized_text: NormalizedText::NotNormalized,
107 normalization_language: None,
108 is_end_of_sentence: false,
109 }
110 }
111
112 /// Create a new token that carries an `is_end_of_sentence` marker.
113 ///
114 /// Recommended way of deriving the empty text:
115 /// ```rust
116 /// # use unobtanium_segmenter::SegmentedToken;
117 /// # let token = SegmentedToken::new("Some example sentence to segment.", None);
118 /// # let sentence = token.text; // Actual segmenter goes here
119 /// let (main, tail) = sentence.split_at(sentence.len());
120 /// SegmentedToken::new_derived_from(main, &token);
121 /// SegmentedToken::new_end_of_sentence(tail);
122 /// ```
123 pub fn new_end_of_sentence(empty_text: &'a str) -> Self {
124 let mut new = Self::new(empty_text, None);
125 new.is_end_of_sentence = true;
126 return new;
127 }
128
129 /// Helper function to convert texts that came ot of a simple helper function
130 /// back into segments.
131 ///
132 /// Using this implies that further segmenting didn't change anything
133 /// for the metadta of the child segments.
134 pub fn covert_to_child_segements_of_self(
135 &'a self,
136 texts: &'a [&'a str],
137 ) -> impl Iterator<Item = SegmentedToken<'a>> + 'a {
138 texts.iter().map(|text| Self::new_derived_from(text, self))
139 }
140
141 /// Builder like convenience function to set the `is_known_word` flag.
142 #[inline(always)]
143 pub fn with_is_kown_word(mut self, is_known_word: bool) -> Self {
144 self.is_known_word = is_known_word;
145 return self;
146 }
147
148 /// Return the `normalized_text` of this token if present and `text` if not as a `str`.
149 pub fn get_text_prefer_normalized(&self) -> &str {
150 if let NormalizedText::Normalized(normalized_text) = &self.normalized_text {
151 return normalized_text.as_str();
152 } else {
153 return self.text;
154 }
155 }
156
157 /// This is the same as [get_text_prefer_normalized()][Self::get_text_prefer_normalized], but returns an owned String instead.
158 pub fn get_text_prefer_normalized_owned(&self) -> String {
159 self.get_text_prefer_normalized().to_string()
160 }
161
162 /// Returns the normalized text behind this token.
163 ///
164 /// If the normalization is [NormalizedText::NormalizedToSelf] it'll return the original text.
165 ///
166 /// It will only return `None` if not normalization was applied.
167 pub fn get_normalized_text(&self) -> Option<&str> {
168 match &self.normalized_text {
169 NormalizedText::NotNormalized => None,
170 NormalizedText::NormalizedToSelf => Some(self.text),
171 NormalizedText::Normalized(text) => Some(text.as_str()),
172 }
173 }
174
175 /// Update this tokens normalized text with an unowned `&str`.
176 ///
177 /// If the text already matches the unnormalized text, the `normalized_text` will be set to [NormalizedText::NormalizedToSelf].
178 ///
179 /// `lang` is the language that the normalization happend for, set to `None` if the normalization was language independent.
180 pub fn update_normalized_str(&mut self, normalized: &str, lang: Option<Lang>) {
181 if self.text == normalized {
182 self.normalized_text = NormalizedText::NormalizedToSelf;
183 } else {
184 self.normalized_text = NormalizedText::Normalized(normalized.to_owned())
185 }
186 self.update_normalization_language(lang);
187 }
188
189 /// Update this tokens normalized text with an owned `String`.
190 ///
191 /// If the text already matches the unnormalized text, the `normalized_text` will be set to [NormalizedText::NormalizedToSelf].
192 ///
193 /// `lang` is the language that the normalization happend for, set to `None` if the normalization was language independent.
194 pub fn update_normalized_string(&mut self, normalized: String, lang: Option<Lang>) {
195 if self.text == normalized {
196 self.normalized_text = NormalizedText::NormalizedToSelf;
197 } else {
198 self.normalized_text = NormalizedText::Normalized(normalized);
199 }
200 self.update_normalization_language(lang);
201 }
202
203 /// Update the normalization language, `None` means languge independent
204 pub fn update_normalization_language(&mut self, lang: Option<Lang>) {
205 if lang.is_some() {
206 self.normalization_language = lang;
207 }
208 }
209
210 /// Returns wheather the text was normalized or not.
211 #[inline]
212 pub fn was_normalized(&self) -> bool {
213 !matches!(self.normalized_text, NormalizedText::NotNormalized)
214 }
215}
216
217impl<'a> From<&'a String> for SegmentedToken<'a> {
218 fn from(value: &'a String) -> Self {
219 Self::new(value, None)
220 }
221}
222
223impl<'a> From<&'a str> for SegmentedToken<'a> {
224 fn from(value: &'a str) -> Self {
225 Self::new(value, None)
226 }
227}
228
/// What kind of content to expect from a [SegmentedToken].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SegmentedTokenKind {
    /// The token is a collection of any kind of letters and numbers.
    AlphaNumeric,

    /// The token is some kind of separator.
    Separator,

    /// The token represents a symbol.
    Symbol,
}
241
/// The possible outcomes of text normalization on a [SegmentedToken].
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub enum NormalizedText {
    /// No normalization took place: either the token didn't pass a normalization stage, or the normalization stage didn't attempt a normalization (i.e. because no algorithm is implemented for a specific language).
    #[default]
    NotNormalized,

    /// A normalization took place, but its result is identical to the original text.
    NormalizedToSelf,

    /// A normalization took place and produced something that differs from the original text.
    Normalized(String),
}

impl From<NormalizedText> for Option<String> {
    /// Extract the stored string of [NormalizedText::Normalized], every other variant becomes `None`.
    fn from(value: NormalizedText) -> Self {
        match value {
            NormalizedText::Normalized(text) => Some(text),
            NormalizedText::NotNormalized | NormalizedText::NormalizedToSelf => None,
        }
    }
}