1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
// SPDX-FileCopyrightText: 2026 Slatian
//
// SPDX-License-Identifier: LGPL-3.0-only
use whatlang::Lang;
use whatlang::Script;
/// The main representation of data this crate works on.
///
/// A token is effectively `text` with metadata attached, this struct being the metadata carrier.
///
/// The lifetime `'a` is the lifetime of the borrowed `text`.
#[derive(Debug, Clone, PartialEq)]
pub struct SegmentedToken<'a> {
/// The piece of text that this token represents.
///
/// This should be borrowed from the initial text that was fed to the segmenter chain.
pub text: &'a str,
/// If a [normalizer](crate::normalization) output was different from `text` the result will be stored in here.
///
/// It is recommended that you fetch normalized text using [get_text_prefer_normalized()][Self::get_text_prefer_normalized] or [get_text_prefer_normalized_owned()][Self::get_text_prefer_normalized_owned].
///
/// This is not inherited by [new_derived_from()][Self::new_derived_from], derived tokens start out as [NormalizedText::NotNormalized].
pub normalized_text: NormalizedText,
/// Which language the normalization that resulted in `normalized_text` happened with.
///
/// `None` means that only language independent normalizations were applied, for language dependent normalizations this should make sure they're all applied for the same language.
///
/// If already set to `Some` it shouldn't be changed.
pub normalization_language: Option<Lang>,
/// What kind of token this is.
///
/// Set by:
/// * [NaiveWordSplitter][crate::segmentation::NaiveWordSplitter]
/// * [AugmentationClassify][crate::augmentation::AugmentationClassify]
///
/// This is not inherited by [new_derived_from()][Self::new_derived_from], derived tokens start out with `None`.
pub kind: Option<SegmentedTokenKind>,
/// The primary script as detected by a script or language detection augmenter.
///
/// Information about detected scripts is inherited across splitting.
pub detected_script: Option<Script>,
/// The primary language detected by a language detection augmenter.
///
/// Information about detected languages is inherited across splitting.
pub detected_language: Option<Lang>,
/// How confident the language detector was about the language that it detected.
///
/// This scales between `0` (not confident at all) and `1` (most confident).
pub detected_language_confidence: f64,
/// Whether the language detector considers its output to be reliable.
pub is_detected_language_relible: bool,
/// Indicates that no further splitting is necessary.
///
/// This should be set to true if the token was a valid word in a dictionary.
///
/// This flag is inherited by [new_derived_from()][Self::new_derived_from].
pub is_known_word: bool,
/// Indicates that this token marks the end of a sentence.
///
/// This should only be set on tokens with an empty `text` field. It is not inherited.
pub is_end_of_sentence: bool,
}
impl<'a> SegmentedToken<'a> {
    /// Create a segmented token from scratch. (You likely won't need it.)
    ///
    /// All metadata starts out empty: the token is not normalized, no language is detected and
    /// none of the flags are set. The only exception is `detected_script`, which is filled in
    /// using `whatlang::detect_script` when `kind` is [SegmentedTokenKind::AlphaNumeric].
    ///
    /// If you are writing a segmenter have a look at [new_derived_from()][Self::new_derived_from].
    ///
    /// For creating the initial token consider using the `From` implementations or the [StartSegmentationChain][crate::chain::StartSegmentationChain] trait.
    pub fn new(text: &'a str, kind: Option<SegmentedTokenKind>) -> Self {
        // Script detection is only attempted for tokens already classified as alphanumeric.
        let detected_script = if matches!(kind, Some(SegmentedTokenKind::AlphaNumeric)) {
            whatlang::detect_script(text)
        } else {
            None
        };
        Self {
            text,
            kind,
            normalized_text: NormalizedText::NotNormalized,
            normalization_language: None,
            is_known_word: false,
            detected_script,
            detected_language: None,
            detected_language_confidence: 0.0,
            is_detected_language_relible: false,
            is_end_of_sentence: false,
        }
    }

    /// Create a token with a given text that inherits metadata from the `from` token.
    ///
    /// Inherited are `is_known_word`, `detected_script`, `detected_language`,
    /// `detected_language_confidence` and `is_detected_language_relible`.
    ///
    /// Not inherited are `kind` (reset to `None`), the normalization state (reset to
    /// [NormalizedText::NotNormalized] without a normalization language) and
    /// `is_end_of_sentence` (reset to `false`).
    ///
    /// This is the recommended constructor to use inside a segmenter after splitting.
    pub fn new_derived_from(text: &'a str, from: &Self) -> Self {
        Self {
            text,
            kind: None,
            is_known_word: from.is_known_word,
            detected_script: from.detected_script,
            detected_language: from.detected_language,
            detected_language_confidence: from.detected_language_confidence,
            is_detected_language_relible: from.is_detected_language_relible,
            normalized_text: NormalizedText::NotNormalized,
            normalization_language: None,
            is_end_of_sentence: false,
        }
    }

    /// Create a new token that carries an `is_end_of_sentence` marker.
    ///
    /// Apart from the marker the token is identical to `Self::new(empty_text, None)`.
    ///
    /// Recommended way of deriving the empty text:
    /// ```rust
    /// # use unobtanium_segmenter::SegmentedToken;
    /// # let token = SegmentedToken::new("Some example sentence to segment.", None);
    /// # let sentence = token.text; // Actual segmenter goes here
    /// let (main, tail) = sentence.split_at(sentence.len());
    /// SegmentedToken::new_derived_from(main, &token);
    /// SegmentedToken::new_end_of_sentence(tail);
    /// ```
    pub fn new_end_of_sentence(empty_text: &'a str) -> Self {
        Self {
            is_end_of_sentence: true,
            ..Self::new(empty_text, None)
        }
    }

    /// Helper function to convert texts that came out of a simple helper function
    /// back into segments.
    ///
    /// Every entry of `texts` is turned into a token using
    /// [new_derived_from()][Self::new_derived_from] with `self` as the parent.
    ///
    /// Using this implies that further segmenting didn't change anything
    /// for the metadata of the child segments.
    ///
    /// Note: the spelling of the method name is kept as is for backwards compatibility.
    pub fn covert_to_child_segements_of_self(
        &'a self,
        texts: &'a [&'a str],
    ) -> impl Iterator<Item = SegmentedToken<'a>> + 'a {
        // `move` copies the `&'a Self` into the closure, so the returned iterator does not
        // borrow the local `self` binding.
        texts
            .iter()
            .map(move |text| Self::new_derived_from(text, self))
    }

    /// Builder like convenience function to set the `is_known_word` flag.
    ///
    /// Note: the spelling of the method name is kept as is for backwards compatibility.
    #[inline]
    pub fn with_is_kown_word(mut self, is_known_word: bool) -> Self {
        self.is_known_word = is_known_word;
        self
    }

    /// Builder like convenience function to set the detected language.
    ///
    /// Sets `detected_language`, `is_detected_language_relible` and
    /// `detected_language_confidence` in one go.
    #[inline]
    pub fn with_detected_language(
        mut self,
        lang: Option<Lang>,
        is_relible: bool,
        confidence: f64,
    ) -> Self {
        self.detected_language = lang;
        self.is_detected_language_relible = is_relible;
        self.detected_language_confidence = confidence;
        self
    }

    /// Return the `normalized_text` of this token if present and `text` if not as a `str`.
    ///
    /// `text` is returned for both [NormalizedText::NotNormalized] and
    /// [NormalizedText::NormalizedToSelf].
    pub fn get_text_prefer_normalized(&self) -> &str {
        match &self.normalized_text {
            NormalizedText::Normalized(normalized_text) => normalized_text.as_str(),
            NormalizedText::NotNormalized | NormalizedText::NormalizedToSelf => self.text,
        }
    }

    /// This is the same as [get_text_prefer_normalized()][Self::get_text_prefer_normalized], but returns an owned String instead.
    pub fn get_text_prefer_normalized_owned(&self) -> String {
        self.get_text_prefer_normalized().to_string()
    }

    /// Returns the normalized text behind this token.
    ///
    /// If the normalization is [NormalizedText::NormalizedToSelf] it'll return the original text.
    ///
    /// It will only return `None` if no normalization was applied.
    pub fn get_normalized_text(&self) -> Option<&str> {
        match &self.normalized_text {
            NormalizedText::NotNormalized => None,
            NormalizedText::NormalizedToSelf => Some(self.text),
            NormalizedText::Normalized(text) => Some(text.as_str()),
        }
    }

    /// Update this token's normalized text with an unowned `&str`.
    ///
    /// If the text already matches the unnormalized text, the `normalized_text` will be set to [NormalizedText::NormalizedToSelf].
    ///
    /// `lang` is the language that the normalization happened for, set to `None` if the normalization was language independent.
    pub fn update_normalized_str(&mut self, normalized: &str, lang: Option<Lang>) {
        self.normalized_text = if self.text == normalized {
            NormalizedText::NormalizedToSelf
        } else {
            NormalizedText::Normalized(normalized.to_owned())
        };
        self.update_normalization_language(lang);
    }

    /// Update this token's normalized text with an owned `String`.
    ///
    /// If the text already matches the unnormalized text, the `normalized_text` will be set to [NormalizedText::NormalizedToSelf].
    ///
    /// `lang` is the language that the normalization happened for, set to `None` if the normalization was language independent.
    pub fn update_normalized_string(&mut self, normalized: String, lang: Option<Lang>) {
        self.normalized_text = if self.text == normalized {
            NormalizedText::NormalizedToSelf
        } else {
            NormalizedText::Normalized(normalized)
        };
        self.update_normalization_language(lang);
    }

    /// Update the normalization language, `None` means language independent.
    ///
    /// Passing `None` leaves a previously recorded language untouched,
    /// passing `Some` replaces whatever was stored before.
    pub fn update_normalization_language(&mut self, lang: Option<Lang>) {
        if lang.is_some() {
            self.normalization_language = lang;
        }
    }

    /// Returns whether the text was normalized or not.
    ///
    /// This is `true` for everything except [NormalizedText::NotNormalized].
    #[inline]
    pub fn was_normalized(&self) -> bool {
        !matches!(self.normalized_text, NormalizedText::NotNormalized)
    }
}
impl<'a> From<&'a String> for SegmentedToken<'a> {
fn from(value: &'a String) -> Self {
Self::new(value, None)
}
}
impl<'a> From<&'a str> for SegmentedToken<'a> {
fn from(value: &'a str) -> Self {
Self::new(value, None)
}
}
/// What kind of content to expect from a [SegmentedToken].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SegmentedTokenKind {
/// The token is a collection of any kind of letters and numbers.
///
/// [SegmentedToken::new] runs script detection for tokens created with this kind.
AlphaNumeric,
/// The token is some kind of separator.
Separator,
/// The token represents a symbol.
Symbol,
}
/// Represents the outcomes of text normalization that can happen on a [SegmentedToken].
///
/// The default value is [NormalizedText::NotNormalized].
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub enum NormalizedText {
/// The token was not normalized: either it didn't pass a normalization stage, or the normalization stage didn't attempt normalization (e.g. because no algorithm is implemented for a specific language).
#[default]
NotNormalized,
/// The token was normalized, but the result is the same as the original text.
NormalizedToSelf,
/// The token was normalized into something that is not the original text.
///
/// Carries the normalized text as an owned `String`.
Normalized(String),
}
/// Extracts the owned normalized text, if there is any.
///
/// Only [NormalizedText::Normalized] converts to `Some`, the other two variants carry no
/// text of their own and convert to `None`.
impl From<NormalizedText> for Option<String> {
    fn from(value: NormalizedText) -> Self {
        match value {
            NormalizedText::Normalized(text) => Some(text),
            NormalizedText::NotNormalized | NormalizedText::NormalizedToSelf => None,
        }
    }
}