unobtanium_segmenter/segmented_token.rs
1use whatlang::Lang;
2use whatlang::Script;
3
4/// The main representation of data this crate works on.
5///
6/// A token is effectively `text` with metadata attached, this struct being the metadata carrier.
7#[derive(Debug)]
8pub struct SegmentedToken<'a> {
9 /// The piece of text that this token represents.
10 ///
11 /// This should be borrowed from the initial text that was fed to the segmenter chain.
12 pub text: &'a str,
13
14 /// If a normalizers output was different from `text` the result will be stored in here.
15 ///
16 /// It is recommended that you fetch normalized text using [get_text_prefer_normalized()][Self::get_text_prefer_normalized] or [get_text_prefer_normalized_owned()][Self::get_text_prefer_normalized_owned].
17 pub normalized_text: Option<String>,
18
19 /// What kind of token this is, currently only set by the [NaiveWordSplitter][crate::segmentation::NaiveWordSplitter].
20 pub kind: Option<SegmentedTokenKind>,
21
22 /// The primary script as detected by a script or language detection augmenter.
23 ///
24 /// Information about detected scripts is inherited across splitting.
25 pub detected_script: Option<Script>,
26
27 /// The primary language detected by a language detection augmenter.
28 ///
29 /// Information about detected languages in inherited across splitting.
30 pub detected_language: Option<Lang>,
31
32 /// How confident the language detector was about the language that it detectd.
33 ///
34 /// This scales inbetween `0` (not confident at all) and `1` (most confident).
35 pub detected_language_confidence: f64,
36
37 /// Wheter the language detector considers its output to be reliable.
38 pub is_detected_language_relible: bool,
39
40 /// Indicates that no further splitting is neccessary.
41 ///
42 /// This should be set to true if the token was a valid word in a dictionary.
43 pub is_known_word: bool,
44 //TODO: Add useful flags here
45 // contains_emoji: bool,
46}
47
48impl<'a> SegmentedToken<'a> {
49 /// Create a segmented token from scratch. (You likely won't need it)
50 ///
51 /// If you are wwriting a segmenter have a look at [new_derived_from()][Self::new_derived_from].
52 ///
53 /// For creating the initial token consider usng the `From` implementations or the [StartSegmentationChain][crate::chain::StartSegmentationChain] trait.
54 pub fn new(text: &'a str, kind: Option<SegmentedTokenKind>) -> Self {
55 let mut out = Self {
56 text,
57 kind,
58 normalized_text: None,
59 is_known_word: false,
60 detected_script: None,
61 detected_language: None,
62 detected_language_confidence: 0.0,
63 is_detected_language_relible: false,
64 };
65 match kind {
66 Some(SegmentedTokenKind::AlphaNumeric) => {
67 out.detected_script = whatlang::detect_script(text);
68 }
69 _ => { /* Do nothing */ }
70 }
71 return out;
72 }
73
74 /// Create a token with a given text that inerits metadata from the `from` token.
75 ///
76 /// This is the recommended constructor to use inside a segmenter after splitting.
77 pub fn new_derived_from(text: &'a str, from: &Self) -> Self {
78 Self {
79 text,
80 kind: from.kind,
81 is_known_word: from.is_known_word,
82 detected_script: from.detected_script,
83 detected_language: from.detected_language,
84 detected_language_confidence: from.detected_language_confidence,
85 is_detected_language_relible: from.is_detected_language_relible,
86 normalized_text: None,
87 }
88 }
89
90 /// Helper function to convert texts that came ot of a simple helper function
91 /// back into segments.
92 ///
93 /// Using this implies that further segmenting didn't change anything
94 /// for the metadta of the child segments.
95 pub fn covert_to_child_segements_of_self(
96 &'a self,
97 texts: &'a [&'a str],
98 ) -> impl Iterator<Item = SegmentedToken<'a>> + 'a {
99 texts.iter().map(|text| Self::new_derived_from(text, self))
100 }
101
102 /// Builder like convenience function to set the `is_known_word` flag.
103 #[inline(always)]
104 pub fn with_is_kown_word(mut self, is_known_word: bool) -> Self {
105 self.is_known_word = is_known_word;
106 return self;
107 }
108
109 /// Return the `normalized_text` of this token if present and `text` if not as a `str`.
110 pub fn get_text_prefer_normalized(&self) -> &str {
111 if let Some(normalized_text) = &self.normalized_text {
112 return normalized_text.as_str();
113 } else {
114 return self.text;
115 }
116 }
117
118 /// This is the same as [get_text_prefer_normalized()][Self::get_text_prefer_normalized], but returns an owned String instead.
119 pub fn get_text_prefer_normalized_owned(&self) -> String {
120 self.get_text_prefer_normalized().to_string()
121 }
122}
123
124impl<'a> From<&'a String> for SegmentedToken<'a> {
125 fn from(value: &'a String) -> Self {
126 Self::new(value, None)
127 }
128}
129
130impl<'a> From<&'a str> for SegmentedToken<'a> {
131 fn from(value: &'a str) -> Self {
132 Self::new(value, None)
133 }
134}
135
/// What kind of content to expect from a [SegmentedToken].
///
/// The kinds are plain, copyable markers that can be compared with `==` and used as
/// keys in hashed collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SegmentedTokenKind {
    /// The token is a collection of any kind of letters and numbers.
    AlphaNumeric,

    /// The token is some kind of separator.
    Separator,

    /// The token represents a symbol.
    Symbol,
    // TODO:
    // (word|subword|light-separator|heavy-separator)
}