unobtanium_segmenter/
segmented_token.rs

1use whatlang::Lang;
2use whatlang::Script;
3
/// The main representation of data this crate works on.
///
/// A token is effectively `text` with metadata attached, this struct being the metadata carrier.
///
/// The lifetime `'a` is the lifetime of the borrowed `text`.
#[derive(Debug, Clone, PartialEq)]
pub struct SegmentedToken<'a> {
	/// The piece of text that this token represents.
	///
	/// This should be borrowed from the initial text that was fed to the segmenter chain.
	pub text: &'a str,

	/// If a [normalizer](crate::normalization) output was different from `text` the result will be stored in here.
	///
	/// It is recommended that you fetch normalized text using [get_text_prefer_normalized()][Self::get_text_prefer_normalized] or [get_text_prefer_normalized_owned()][Self::get_text_prefer_normalized_owned].
	///
	/// This is not inherited: [new_derived_from()][Self::new_derived_from] resets it to `None`.
	pub normalized_text: Option<String>,

	/// What kind of token this is.
	///
	/// Set by:
	/// * [NaiveWordSplitter][crate::segmentation::NaiveWordSplitter]
	/// * [AugmentationClassify][crate::augmentation::AugmentationClassify]
	///
	/// This is not inherited: [new_derived_from()][Self::new_derived_from] resets it to `None`.
	pub kind: Option<SegmentedTokenKind>,

	/// The primary script as detected by a script or language detection augmenter.
	///
	/// Information about detected scripts is inherited across splitting.
	pub detected_script: Option<Script>,

	/// The primary language detected by a language detection augmenter.
	///
	/// Information about detected languages is inherited across splitting.
	pub detected_language: Option<Lang>,

	/// How confident the language detector was about the language that it detected.
	///
	/// This scales between `0` (not confident at all) and `1` (most confident).
	/// [new()][Self::new] initializes it to `0.0`.
	pub detected_language_confidence: f64,

	/// Whether the language detector considers its output to be reliable.
	///
	/// NOTE(review): the field name is misspelled (`relible`); renaming it would break users of this public field.
	pub is_detected_language_relible: bool,

	/// Indicates that no further splitting is necessary.
	///
	/// This should be set to true if the token was a valid word in a dictionary.
	/// The flag is copied to child tokens by [new_derived_from()][Self::new_derived_from].
	pub is_known_word: bool,

	/// Indicates that this token marks the end of a sentence.
	///
	/// This should only be set on tokens with an empty `text` field. It is not inherited.
	pub is_end_of_sentence: bool,
	//TODO: Add useful flags here
	// contains_emoji: bool,
}
56
57impl<'a> SegmentedToken<'a> {
58	/// Create a segmented token from scratch. (You likely won't need it)
59	///
60	/// If you are wwriting a segmenter have a look at [new_derived_from()][Self::new_derived_from].
61	///
62	/// For creating the initial token consider usng the `From` implementations or the [StartSegmentationChain][crate::chain::StartSegmentationChain] trait.
63	pub fn new(text: &'a str, kind: Option<SegmentedTokenKind>) -> Self {
64		let mut out = Self {
65			text,
66			kind,
67			normalized_text: None,
68			is_known_word: false,
69			detected_script: None,
70			detected_language: None,
71			detected_language_confidence: 0.0,
72			is_detected_language_relible: false,
73			is_end_of_sentence: false,
74		};
75		match kind {
76			Some(SegmentedTokenKind::AlphaNumeric) => {
77				out.detected_script = whatlang::detect_script(text);
78			}
79			_ => { /* Do nothing */ }
80		}
81		return out;
82	}
83
84	/// Create a token with a given text that inerits metadata from the `from` token.
85	///
86	/// This is the recommended constructor to use inside a segmenter after splitting.
87	pub fn new_derived_from(text: &'a str, from: &Self) -> Self {
88		Self {
89			text,
90			kind: None,
91			is_known_word: from.is_known_word,
92			detected_script: from.detected_script,
93			detected_language: from.detected_language,
94			detected_language_confidence: from.detected_language_confidence,
95			is_detected_language_relible: from.is_detected_language_relible,
96			normalized_text: None,
97			is_end_of_sentence: false,
98		}
99	}
100
101	/// Create a new token that carries an `is_end_of_sentence` marker.
102	///
103	/// Recommended way of deriving the empty text:
104	/// ```rust
105	/// # use unobtanium_segmenter::SegmentedToken;
106	/// # let token = SegmentedToken::new("Some example sentence to segment.", None);
107	/// # let sentence = token.text; // Actual segmenter goes here
108	/// let (main, tail) = sentence.split_at(sentence.len());
109	/// SegmentedToken::new_derived_from(main, &token);
110	/// SegmentedToken::new_end_of_sentence(tail);
111	/// ```
112	pub fn new_end_of_sentence(empty_text: &'a str) -> Self {
113		let mut new = Self::new(empty_text, None);
114		new.is_end_of_sentence = true;
115		return new;
116	}
117
118	/// Helper function to convert texts that came ot of a simple helper function
119	/// back into segments.
120	///
121	/// Using this implies that further segmenting didn't change anything
122	/// for the metadta of the child segments.
123	pub fn covert_to_child_segements_of_self(
124		&'a self,
125		texts: &'a [&'a str],
126	) -> impl Iterator<Item = SegmentedToken<'a>> + 'a {
127		texts.iter().map(|text| Self::new_derived_from(text, self))
128	}
129
130	/// Builder like convenience function to set the `is_known_word` flag.
131	#[inline(always)]
132	pub fn with_is_kown_word(mut self, is_known_word: bool) -> Self {
133		self.is_known_word = is_known_word;
134		return self;
135	}
136
137	/// Return the `normalized_text` of this token if present and `text` if not as a `str`.
138	pub fn get_text_prefer_normalized(&self) -> &str {
139		if let Some(normalized_text) = &self.normalized_text {
140			return normalized_text.as_str();
141		} else {
142			return self.text;
143		}
144	}
145
146	/// This is the same as [get_text_prefer_normalized()][Self::get_text_prefer_normalized], but returns an owned String instead.
147	pub fn get_text_prefer_normalized_owned(&self) -> String {
148		self.get_text_prefer_normalized().to_string()
149	}
150}
151
152impl<'a> From<&'a String> for SegmentedToken<'a> {
153	fn from(value: &'a String) -> Self {
154		Self::new(value, None)
155	}
156}
157
158impl<'a> From<&'a str> for SegmentedToken<'a> {
159	fn from(value: &'a str) -> Self {
160		Self::new(value, None)
161	}
162}
163
/// What kind of content to expect from a [SegmentedToken].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SegmentedTokenKind {
	/// The token is a collection of any kind of letters and numbers.
	///
	/// [SegmentedToken::new] runs script detection on the text of tokens created with this kind.
	AlphaNumeric,

	/// The token is some kind of separator.
	Separator,

	/// The token represents a symbol.
	Symbol,
}