lindera/token.rs
1use std::borrow::Cow;
2
3use lindera_dictionary::dictionary::UNK;
4use serde_json::{Value, json};
5
6use crate::dictionary::{Dictionary, UserDictionary, WordId};
7
8#[derive(Clone)]
9pub struct Token<'a> {
10 /// The text content of the token, which is a copy-on-write string slice.
11 /// This allows for efficient handling of both owned and borrowed string data.
12 pub surface: Cow<'a, str>,
13
14 /// The starting byte position of the token in the original text.
15 /// This indicates where the token begins in the input string.
16 pub byte_start: usize,
17
18 /// The ending byte position of the token in the original text.
19 /// This indicates the position immediately after the last byte of the token.
20 pub byte_end: usize,
21
22 /// This field represents the starting byte position of the token within the original input text.
23 /// It is useful for mapping the token back to its location in the input.
24 pub position: usize,
25
26 /// The length of the token's position in the text.
27 /// This indicates how many characters the token spans.
28 pub position_length: usize,
29
30 /// The identifier for the word, used to uniquely distinguish it within the context of the application.
31 pub word_id: WordId,
32
33 /// A reference to the dictionary used for tokenization.
34 ///
35 /// The dictionary contains the data necessary for the tokenization process,
36 /// including word entries and their associated metadata. This reference
37 /// allows the tokenizer to access and utilize the dictionary during
38 /// the tokenization of input text.
39 pub dictionary: &'a Dictionary,
40
41 /// An optional reference to a user-defined dictionary.
42 ///
43 /// This dictionary can be used to add custom words or override existing words
44 /// in the default dictionary. If `None`, the default dictionary is used.
45 pub user_dictionary: Option<&'a UserDictionary>,
46
47 /// An optional vector containing detailed information about the token.
48 /// Each element in the vector is a `Cow` (Copy-On-Write) type, which allows
49 /// for efficient handling of both owned and borrowed string data.
50 ///
51 /// # Note
52 ///
53 /// This field is optional and may be `None` if no detailed information is available.
54 pub details: Option<Vec<Cow<'a, str>>>,
55}
56
57impl<'a> Token<'a> {
58 /// Creates a new `Token` instance with the provided parameters.
59 ///
60 /// # Arguments
61 ///
62 /// * `text` - A `Cow<'a, str>` representing the text of the token. This can be either a borrowed or owned string.
63 /// * `start` - The byte position where the token starts in the original text.
64 /// * `end` - The byte position where the token ends in the original text.
65 /// * `position` - The position of the token in the sequence of tokens (usually an index).
66 /// * `word_id` - The `WordId` associated with the token, identifying the token in the dictionary.
67 /// * `dictionary` - A reference to the `Dictionary` that contains information about the token.
68 /// * `user_dictionary` - An optional reference to a `UserDictionary`, which may provide additional user-defined tokens.
69 ///
70 /// # Returns
71 ///
72 /// Returns a new `Token` instance initialized with the provided values.
73 ///
74 /// # Details
75 ///
76 /// - The token's `text` can be a borrowed reference or an owned string, thanks to the use of `Cow<'a, str>`.
77 /// - `byte_start` and `byte_end` are used to define the token's byte offset within the original text.
78 /// - `position` marks the token's place in the overall tokenized sequence.
79 /// - `position_length` is set to `1` by default.
80 /// - `word_id` is used to identify the token in the dictionary, and the dictionaries (both `dictionary` and `user_dictionary`) provide additional details about the token.
81 pub fn new(
82 surface: Cow<'a, str>,
83 start: usize,
84 end: usize,
85 position: usize,
86 word_id: WordId,
87 dictionary: &'a Dictionary,
88 user_dictionary: Option<&'a UserDictionary>,
89 ) -> Self {
90 Self {
91 surface,
92 byte_start: start,
93 byte_end: end,
94 position,
95 position_length: 1,
96 word_id,
97 dictionary,
98 user_dictionary,
99 details: None,
100 }
101 }
102
103 /// Retrieves the details of the token, either from the dictionary or the user-defined dictionary.
104 ///
105 /// # Returns
106 ///
107 /// Returns a `Vec<&str>` containing the token's details. These details are typically part-of-speech information or other metadata about the token.
108 ///
109 /// # Process
110 ///
111 /// 1. **Check if details are already set**:
112 /// - If `self.details` is `None`, the method will attempt to fetch the details from either the system dictionary or the user dictionary.
113 /// - If the `word_id` is unknown, a default value `UNK` is returned.
114 /// 2. **Fetch details from dictionaries**:
115 /// - If the `word_id` corresponds to a system dictionary entry, details are fetched from `self.dictionary`.
116 /// - If the `word_id` corresponds to a user-defined dictionary, details are fetched from `self.user_dictionary`.
117 /// 3. **Store details**:
118 /// - The fetched details are stored in `self.details` as `Some(Vec<Cow<str>>)` to avoid recalculating them in subsequent calls.
119 /// 4. **Return details as `&str`**:
120 /// - The `Cow<str>` values stored in `self.details` are converted to `&str` and returned.
121 ///
122 /// # Notes
123 ///
124 /// - The first time this method is called, it fetches the details from the dictionary (or user dictionary), but on subsequent calls, it returns the cached details in `self.details`.
125 /// - If the token is unknown and no details can be retrieved, a default value (`UNK`) is used.
126 pub fn details(&mut self) -> Vec<&str> {
127 // Ensure details are initialized
128 self.ensure_details();
129
130 // Fast path: return references without allocation
131 match &self.details {
132 Some(details) => details.iter().map(|x| x.as_ref()).collect(),
133 None => UNK.to_vec(), // Fallback, should not happen after ensure_details()
134 }
135 }
136
137 /// Helper method to ensure details are loaded without returning them
138 fn ensure_details(&mut self) {
139 if self.details.is_none() {
140 let tmp = if self.word_id.is_unknown() {
141 UNK.to_vec()
142 } else if self.word_id.is_system() {
143 self.dictionary.word_details(self.word_id.id as usize)
144 } else {
145 match self.user_dictionary {
146 Some(user_dictionary) => user_dictionary.word_details(self.word_id.id as usize),
147 None => UNK.to_vec(),
148 }
149 };
150
151 self.details = Some(tmp.into_iter().map(Cow::Borrowed).collect());
152 }
153 }
154
155 /// Retrieves the token's detail at the specified index, if available.
156 ///
157 /// # Arguments
158 ///
159 /// * `index` - The index of the detail to retrieve.
160 ///
161 /// # Returns
162 ///
163 /// Returns an `Option<&str>` that contains the detail at the specified index.
164 /// If the index is out of bounds or no details are available, `None` is returned.
165 ///
166 /// # Details
167 ///
168 /// - This method first ensures that the token's details are populated by calling `self.details()`.
169 /// - If details are available and the provided index is valid, the detail at the specified index is returned as `Some(&str)`.
170 /// - If the index is out of range, `None` is returned.
171 pub fn get_detail(&mut self, index: usize) -> Option<&str> {
172 self.details().get(index).copied()
173 }
174
175 /// Sets the token's detail at the specified index with the provided value.
176 ///
177 /// # Arguments
178 ///
179 /// * `index` - The index of the detail to set. This specifies which detail to update.
180 /// * `detail` - A `Cow<'a, str>` representing the new detail value to set. It can either be a borrowed or owned string.
181 ///
182 /// # Details
183 ///
184 /// - If the token's details have already been populated (`self.details` is `Some`), this method updates the detail at the specified index.
185 /// - If the provided index is valid (within bounds of the `details` vector), the detail at that index is replaced by the new `detail` value.
186 /// - If the details have not been set (`self.details` is `None`), this method does nothing.
187 /// - This method does not handle index out-of-bounds errors explicitly, so it assumes that the index provided is valid.
188 ///
189 /// # Notes
190 ///
191 /// - The `Cow<'a, str>` type allows flexibility, as it can handle either borrowed or owned strings.
192 /// - This method does not initialize the details if they are not already set. To ensure the details are set, `details()` can be called prior to calling this method.
193 pub fn set_detail(&mut self, index: usize, detail: Cow<'a, str>) {
194 if let Some(details) = self.details.as_mut() {
195 details[index] = detail;
196 }
197 }
198
199 /// Retrieves the token's detail by field name.
200 ///
201 /// # Arguments
202 ///
203 /// * `field_name` - The name of the field to retrieve.
204 ///
205 /// # Returns
206 ///
207 /// Returns an `Option<&str>` containing the value of the specified field.
208 /// If the field name is not found or the schema is not available, `None` is returned.
209 ///
210 /// # Example
211 ///
212 /// ```no_run
213 /// # use lindera::token::Token;
214 /// # let mut token: Token = unimplemented!();
215 /// let base_form = token.get("base_form");
216 /// let pos = token.get("major_pos");
217 /// ```
218 pub fn get(&mut self, field_name: &str) -> Option<&str> {
219 // Get field index from schema
220 let index = self
221 .dictionary
222 .metadata
223 .dictionary_schema
224 .get_field_index(field_name)?;
225
226 // Handle common fields
227 match index {
228 0 => Some(self.surface.as_ref()), // surface
229 1..=3 => None, // left_context_id, right_context_id, cost are not stored in token
230 _ => {
231 // For custom fields (index >= 4), get from details
232 // details array doesn't include the first 4 common fields
233 self.get_detail(index - 4)
234 }
235 }
236 }
237
238 /// Returns all token fields as a JSON Value.
239 ///
240 /// # Returns
241 ///
242 /// Returns a `serde_json::Value` containing all available fields and their values.
243 /// Numeric fields (byte_start, byte_end, word_id) are represented as numbers,
244 /// while text fields remain as strings.
245 ///
246 /// # Example
247 ///
248 /// ```no_run
249 /// # use lindera::token::Token;
250 /// # let mut token: Token = unimplemented!();
251 /// let value = token.as_value();
252 /// println!("Surface: {}", value["surface"]);
253 /// println!("Byte start: {}", value["byte_start"]); // This is a number
254 /// println!("Word ID: {}", value["word_id"]); // This is a number
255 /// ```
256 pub fn as_value(&mut self) -> Value {
257 // Get schema info first
258 let schema_custom_fields = self
259 .dictionary
260 .metadata
261 .dictionary_schema
262 .get_custom_fields();
263
264 // Copy values before mutable borrow
265 let surface = self.surface.to_string();
266 let byte_start = self.byte_start;
267 let byte_end = self.byte_end;
268 let word_id = self.word_id.id;
269
270 // Get details (requires mutable borrow)
271 let details = self.details();
272
273 // Build JSON object
274 let mut obj = serde_json::Map::new();
275
276 // Add surface as string
277 obj.insert("surface".to_string(), json!(surface));
278
279 // Add byte positions as numbers
280 obj.insert("byte_start".to_string(), json!(byte_start));
281 obj.insert("byte_end".to_string(), json!(byte_end));
282
283 // Add word_id as number
284 obj.insert("word_id".to_string(), json!(word_id));
285
286 // Add each custom field from the schema
287 for (i, field_name) in schema_custom_fields.iter().enumerate() {
288 if let Some(value) = details.get(i) {
289 // Try to parse as number if possible, otherwise keep as string
290 if let Ok(num) = value.parse::<i64>() {
291 obj.insert(field_name.to_string(), json!(num));
292 } else if let Ok(num) = value.parse::<f64>() {
293 obj.insert(field_name.to_string(), json!(num));
294 } else {
295 obj.insert(field_name.to_string(), json!(*value));
296 }
297 }
298 }
299
300 Value::Object(obj)
301 }
302}