unobtanium_segmenter/
segmented_token.rs

1use whatlang::Lang;
2use whatlang::Script;
3
4/// The main representation of data this crate works on.
5///
6/// A token is effectively `text` with metadata attached, this struct being the metadata carrier.
7#[derive(Debug)]
8pub struct SegmentedToken<'a> {
9	/// The piece of text that this token represents.
10	///
11	/// This should be borrowed from the initial text that was fed to the segmenter chain.
12	pub text: &'a str,
13
14	/// If a normalizers output was different from `text` the result will be stored in here.
15	///
16	/// It is recommended that you fetch normalized text using [get_text_prefer_normalized()][Self::get_text_prefer_normalized] or [get_text_prefer_normalized_owned()][Self::get_text_prefer_normalized_owned].
17	pub normalized_text: Option<String>,
18
19	/// What kind of token this is, currently only set by the [NaiveWordSplitter][crate::segmentation::NaiveWordSplitter].
20	pub kind: Option<SegmentedTokenKind>,
21
22	/// The primary script as detected by a script or language detection augmenter.
23	///
24	/// Information about detected scripts is inherited across splitting.
25	pub detected_script: Option<Script>,
26
27	/// The primary language detected by a language detection augmenter.
28	///
29	/// Information about detected languages in inherited across splitting.
30	pub detected_language: Option<Lang>,
31
32	/// How confident the language detector was about the language that it detectd.
33	///
34	/// This scales inbetween `0` (not confident at all) and `1` (most confident).
35	pub detected_language_confidence: f64,
36
37	/// Wheter the language detector considers its output to be reliable.
38	pub is_detected_language_relible: bool,
39
40	/// Indicates that no further splitting is neccessary.
41	///
42	/// This should be set to true if the token was a valid word in a dictionary.
43	pub is_known_word: bool,
44	//TODO: Add useful flags here
45	// contains_emoji: bool,
46}
47
48impl<'a> SegmentedToken<'a> {
49	/// Create a segmented token from scratch. (You likely won't need it)
50	///
51	/// If you are wwriting a segmenter have a look at [new_derived_from()][Self::new_derived_from].
52	///
53	/// For creating the initial token consider usng the `From` implementations or the [StartSegmentationChain][crate::chain::StartSegmentationChain] trait.
54	pub fn new(text: &'a str, kind: Option<SegmentedTokenKind>) -> Self {
55		let mut out = Self {
56			text,
57			kind,
58			normalized_text: None,
59			is_known_word: false,
60			detected_script: None,
61			detected_language: None,
62			detected_language_confidence: 0.0,
63			is_detected_language_relible: false,
64		};
65		match kind {
66			Some(SegmentedTokenKind::AlphaNumeric) => {
67				out.detected_script = whatlang::detect_script(text);
68			}
69			_ => { /* Do nothing */ }
70		}
71		return out;
72	}
73
74	/// Create a token with a given text that inerits metadata from the `from` token.
75	///
76	/// This is the recommended constructor to use inside a segmenter after splitting.
77	pub fn new_derived_from(text: &'a str, from: &Self) -> Self {
78		Self {
79			text,
80			kind: from.kind,
81			is_known_word: from.is_known_word,
82			detected_script: from.detected_script,
83			detected_language: from.detected_language,
84			detected_language_confidence: from.detected_language_confidence,
85			is_detected_language_relible: from.is_detected_language_relible,
86			normalized_text: None,
87		}
88	}
89
90	/// Helper function to convert texts that came ot of a simple helper function
91	/// back into segments.
92	///
93	/// Using this implies that further segmenting didn't change anything
94	/// for the metadta of the child segments.
95	pub fn covert_to_child_segements_of_self(
96		&'a self,
97		texts: &'a [&'a str],
98	) -> impl Iterator<Item = SegmentedToken<'a>> + 'a {
99		texts.iter().map(|text| Self::new_derived_from(text, self))
100	}
101
102	/// Builder like convenience function to set the `is_known_word` flag.
103	#[inline(always)]
104	pub fn with_is_kown_word(mut self, is_known_word: bool) -> Self {
105		self.is_known_word = is_known_word;
106		return self;
107	}
108
109	/// Return the `normalized_text` of this token if present and `text` if not as a `str`.
110	pub fn get_text_prefer_normalized(&self) -> &str {
111		if let Some(normalized_text) = &self.normalized_text {
112			return normalized_text.as_str();
113		} else {
114			return self.text;
115		}
116	}
117
118	/// This is the same as [get_text_prefer_normalized()][Self::get_text_prefer_normalized], but returns an owned String instead.
119	pub fn get_text_prefer_normalized_owned(&self) -> String {
120		self.get_text_prefer_normalized().to_string()
121	}
122}
123
124impl<'a> From<&'a String> for SegmentedToken<'a> {
125	fn from(value: &'a String) -> Self {
126		Self::new(value, None)
127	}
128}
129
130impl<'a> From<&'a str> for SegmentedToken<'a> {
131	fn from(value: &'a str) -> Self {
132		Self::new(value, None)
133	}
134}
135
/// What kind of content to expect from a [SegmentedToken].
///
/// Kinds are plain, fieldless markers: they can be copied freely, compared with `==`
/// and used as keys in hash based collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SegmentedTokenKind {
	/// The token is a collection of any kind of letters and numbers
	AlphaNumeric,

	/// The token is some kind of separator
	Separator,

	/// The token represents a symbol
	Symbol,
	// TODO:
	// (word|subword|light-separator|heavy-separator)
}