unobtanium_segmenter/segmented_token.rs
use whatlang::Lang;
use whatlang::Script;

/// The main representation of data this crate works on.
///
/// A token is effectively `text` with metadata attached, this struct being the metadata carrier.
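///
/// A minimal sketch of creating a token from borrowed text via the `From<&str>`
/// implementation in this module and reading it back:
/// ```rust
/// use unobtanium_segmenter::SegmentedToken;
///
/// let token = SegmentedToken::from("Hello");
/// assert_eq!(token.text, "Hello");
/// // No normalizer has run yet, so this falls back to `text`.
/// assert_eq!(token.get_text_prefer_normalized(), "Hello");
/// ```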
#[derive(Debug, Clone, PartialEq)]
pub struct SegmentedToken<'a> {
    /// The piece of text that this token represents.
    ///
    /// This should be borrowed from the initial text that was fed to the segmenter chain.
    pub text: &'a str,

    /// If a [normalizer](crate::normalization) output was different from `text`, the result will be stored here.
    ///
    /// It is recommended that you fetch normalized text using [get_text_prefer_normalized()][Self::get_text_prefer_normalized] or [get_text_prefer_normalized_owned()][Self::get_text_prefer_normalized_owned].
    pub normalized_text: Option<String>,

    /// What kind of token this is.
    ///
    /// Set by:
    /// * [NaiveWordSplitter][crate::segmentation::NaiveWordSplitter]
    /// * [AugmentationClassify][crate::augmentation::AugmentationClassify]
    pub kind: Option<SegmentedTokenKind>,

    /// The primary script as detected by a script or language detection augmenter.
    ///
    /// Information about detected scripts is inherited across splitting.
    pub detected_script: Option<Script>,

    /// The primary language detected by a language detection augmenter.
    ///
    /// Information about detected languages is inherited across splitting.
    pub detected_language: Option<Lang>,

    /// How confident the language detector was about the language that it detected.
    ///
    /// This ranges from `0` (not confident at all) to `1` (most confident).
    pub detected_language_confidence: f64,

    /// Whether the language detector considers its output to be reliable.
    pub is_detected_language_relible: bool,

    /// Indicates that no further splitting is necessary.
    ///
    /// This should be set to `true` if the token was a valid word in a dictionary.
    pub is_known_word: bool,

    /// Indicates that this token marks the end of a sentence.
    ///
    /// This should only be set on tokens with an empty `text` field. It is not inherited.
    pub is_end_of_sentence: bool,
    // TODO: Add useful flags here
    // contains_emoji: bool,
}

impl<'a> SegmentedToken<'a> {
    /// Create a segmented token from scratch. (You likely won't need it.)
    ///
    /// If you are writing a segmenter, have a look at [new_derived_from()][Self::new_derived_from].
    ///
    /// For creating the initial token, consider using the `From` implementations or the [StartSegmentationChain][crate::chain::StartSegmentationChain] trait.
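    ///
    /// A minimal sketch; it assumes `SegmentedTokenKind` is exported at the crate
    /// root next to `SegmentedToken`:
    /// ```rust
    /// // Assumed re-export path for `SegmentedTokenKind`.
    /// use unobtanium_segmenter::{SegmentedToken, SegmentedTokenKind};
    ///
    /// let token = SegmentedToken::new("hello", Some(SegmentedTokenKind::AlphaNumeric));
    /// assert_eq!(token.kind, Some(SegmentedTokenKind::AlphaNumeric));
    /// // Script detection runs for alphanumeric tokens, so `detected_script` may already be set.
    /// assert!(!token.is_known_word);
    /// ```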
    pub fn new(text: &'a str, kind: Option<SegmentedTokenKind>) -> Self {
        let mut out = Self {
            text,
            kind,
            normalized_text: None,
            is_known_word: false,
            detected_script: None,
            detected_language: None,
            detected_language_confidence: 0.0,
            is_detected_language_relible: false,
            is_end_of_sentence: false,
        };
        if let Some(SegmentedTokenKind::AlphaNumeric) = kind {
            out.detected_script = whatlang::detect_script(text);
        }
        out
    }

    /// Create a token with a given text that inherits metadata from the `from` token.
    ///
    /// This is the recommended constructor to use inside a segmenter after splitting.
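    ///
    /// A small sketch of deriving child tokens after splitting a parent's text:
    /// ```rust
    /// # use unobtanium_segmenter::SegmentedToken;
    /// let parent = SegmentedToken::new("foo bar", None);
    /// let (left, right) = parent.text.split_at(3);
    /// let first = SegmentedToken::new_derived_from(left, &parent);
    /// let second = SegmentedToken::new_derived_from(right, &parent);
    /// // Detection metadata is carried over; `kind` and `normalized_text` start out empty.
    /// assert_eq!(first.text, "foo");
    /// assert_eq!(second.text, " bar");
    /// ```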
    pub fn new_derived_from(text: &'a str, from: &Self) -> Self {
        Self {
            text,
            kind: None,
            is_known_word: from.is_known_word,
            detected_script: from.detected_script,
            detected_language: from.detected_language,
            detected_language_confidence: from.detected_language_confidence,
            is_detected_language_relible: from.is_detected_language_relible,
            normalized_text: None,
            is_end_of_sentence: false,
        }
    }

    /// Create a new token that carries an `is_end_of_sentence` marker.
    ///
    /// Recommended way of deriving the empty text:
    /// ```rust
    /// # use unobtanium_segmenter::SegmentedToken;
    /// # let token = SegmentedToken::new("Some example sentence to segment.", None);
    /// # let sentence = token.text; // Actual segmenter goes here
    /// let (main, tail) = sentence.split_at(sentence.len());
    /// SegmentedToken::new_derived_from(main, &token);
    /// SegmentedToken::new_end_of_sentence(tail);
    /// ```
    pub fn new_end_of_sentence(empty_text: &'a str) -> Self {
        let mut new = Self::new(empty_text, None);
        new.is_end_of_sentence = true;
        new
    }

    /// Helper function to convert texts that came out of a simple helper function
    /// back into segments.
    ///
    /// Using this implies that further segmenting didn't change anything
    /// for the metadata of the child segments.
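    ///
    /// A short sketch, using a plain whitespace split as a stand-in for such a helper:
    /// ```rust
    /// # use unobtanium_segmenter::SegmentedToken;
    /// let parent = SegmentedToken::new("foo bar", None);
    /// let parts: Vec<&str> = parent.text.split(' ').collect();
    /// let children: Vec<SegmentedToken> = parent
    ///     .covert_to_child_segements_of_self(&parts)
    ///     .collect();
    /// assert_eq!(children.len(), 2);
    /// assert_eq!(children[0].text, "foo");
    /// assert_eq!(children[1].text, "bar");
    /// ```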
    pub fn covert_to_child_segements_of_self(
        &'a self,
        texts: &'a [&'a str],
    ) -> impl Iterator<Item = SegmentedToken<'a>> + 'a {
        texts.iter().map(|text| Self::new_derived_from(text, self))
    }

    /// Builder-like convenience function to set the `is_known_word` flag.
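    ///
    /// A brief sketch, marking a token after a successful dictionary lookup:
    /// ```rust
    /// # use unobtanium_segmenter::SegmentedToken;
    /// let token = SegmentedToken::new("hello", None).with_is_kown_word(true);
    /// assert!(token.is_known_word);
    /// ```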
    #[inline(always)]
    pub fn with_is_kown_word(mut self, is_known_word: bool) -> Self {
        self.is_known_word = is_known_word;
        self
    }

    /// Return this token's `normalized_text` as a `&str` if present, falling back to `text` otherwise.
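    ///
    /// A short sketch, setting `normalized_text` by hand in place of a normalizer:
    /// ```rust
    /// # use unobtanium_segmenter::SegmentedToken;
    /// let mut token = SegmentedToken::new("HELLO", None);
    /// assert_eq!(token.get_text_prefer_normalized(), "HELLO");
    /// token.normalized_text = Some("hello".to_string());
    /// assert_eq!(token.get_text_prefer_normalized(), "hello");
    /// ```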
    pub fn get_text_prefer_normalized(&self) -> &str {
        if let Some(normalized_text) = &self.normalized_text {
            normalized_text.as_str()
        } else {
            self.text
        }
    }

    /// This is the same as [get_text_prefer_normalized()][Self::get_text_prefer_normalized], but returns an owned `String` instead.
    pub fn get_text_prefer_normalized_owned(&self) -> String {
        self.get_text_prefer_normalized().to_string()
    }
}

impl<'a> From<&'a String> for SegmentedToken<'a> {
    fn from(value: &'a String) -> Self {
        Self::new(value, None)
    }
}

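/// A small sketch of starting a chain from a plain `&str` (the `&String`
/// implementation above behaves the same way):
/// ```rust
/// use unobtanium_segmenter::SegmentedToken;
///
/// let token: SegmentedToken = "Hello world".into();
/// assert_eq!(token.text, "Hello world");
/// assert_eq!(token.kind, None);
/// ```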
impl<'a> From<&'a str> for SegmentedToken<'a> {
    fn from(value: &'a str) -> Self {
        Self::new(value, None)
    }
}

/// What kind of content to expect from a [SegmentedToken].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SegmentedTokenKind {
    /// The token is a collection of any kind of letters and numbers
    AlphaNumeric,

    /// The token is some kind of separator
    Separator,

    /// The token represents a symbol
    Symbol,
}