Skip to main content

lindera/
token.rs

1use std::borrow::Cow;
2
3use lindera_dictionary::dictionary::UNK;
4use serde_json::{Value, json};
5
6use crate::dictionary::{Dictionary, UserDictionary, WordId};
7
8#[derive(Clone)]
9pub struct Token<'a> {
10    /// The text content of the token, which is a copy-on-write string slice.
11    /// This allows for efficient handling of both owned and borrowed string data.
12    pub surface: Cow<'a, str>,
13
14    /// The starting byte position of the token in the original text.
15    /// This indicates where the token begins in the input string.
16    pub byte_start: usize,
17
18    /// The ending byte position of the token in the original text.
19    /// This indicates the position immediately after the last byte of the token.
20    pub byte_end: usize,
21
22    /// This field represents the starting byte position of the token within the original input text.
23    /// It is useful for mapping the token back to its location in the input.
24    pub position: usize,
25
26    /// The length of the token's position in the text.
27    /// This indicates how many characters the token spans.
28    pub position_length: usize,
29
30    /// The identifier for the word, used to uniquely distinguish it within the context of the application.
31    pub word_id: WordId,
32
33    /// A reference to the dictionary used for tokenization.
34    ///
35    /// The dictionary contains the data necessary for the tokenization process,
36    /// including word entries and their associated metadata. This reference
37    /// allows the tokenizer to access and utilize the dictionary during
38    /// the tokenization of input text.
39    pub dictionary: &'a Dictionary,
40
41    /// An optional reference to a user-defined dictionary.
42    ///
43    /// This dictionary can be used to add custom words or override existing words
44    /// in the default dictionary. If `None`, the default dictionary is used.
45    pub user_dictionary: Option<&'a UserDictionary>,
46
47    /// An optional vector containing detailed information about the token.
48    /// Each element in the vector is a `Cow` (Copy-On-Write) type, which allows
49    /// for efficient handling of both owned and borrowed string data.
50    ///
51    /// # Note
52    ///
53    /// This field is optional and may be `None` if no detailed information is available.
54    pub details: Option<Vec<Cow<'a, str>>>,
55}
56
57impl<'a> Token<'a> {
58    /// Creates a new `Token` instance with the provided parameters.
59    ///
60    /// # Arguments
61    ///
62    /// * `text` - A `Cow<'a, str>` representing the text of the token. This can be either a borrowed or owned string.
63    /// * `start` - The byte position where the token starts in the original text.
64    /// * `end` - The byte position where the token ends in the original text.
65    /// * `position` - The position of the token in the sequence of tokens (usually an index).
66    /// * `word_id` - The `WordId` associated with the token, identifying the token in the dictionary.
67    /// * `dictionary` - A reference to the `Dictionary` that contains information about the token.
68    /// * `user_dictionary` - An optional reference to a `UserDictionary`, which may provide additional user-defined tokens.
69    ///
70    /// # Returns
71    ///
72    /// Returns a new `Token` instance initialized with the provided values.
73    ///
74    /// # Details
75    ///
76    /// - The token's `text` can be a borrowed reference or an owned string, thanks to the use of `Cow<'a, str>`.
77    /// - `byte_start` and `byte_end` are used to define the token's byte offset within the original text.
78    /// - `position` marks the token's place in the overall tokenized sequence.
79    /// - `position_length` is set to `1` by default.
80    /// - `word_id` is used to identify the token in the dictionary, and the dictionaries (both `dictionary` and `user_dictionary`) provide additional details about the token.
81    pub fn new(
82        surface: Cow<'a, str>,
83        start: usize,
84        end: usize,
85        position: usize,
86        word_id: WordId,
87        dictionary: &'a Dictionary,
88        user_dictionary: Option<&'a UserDictionary>,
89    ) -> Self {
90        Self {
91            surface,
92            byte_start: start,
93            byte_end: end,
94            position,
95            position_length: 1,
96            word_id,
97            dictionary,
98            user_dictionary,
99            details: None,
100        }
101    }
102
103    /// Retrieves the details of the token, either from the dictionary or the user-defined dictionary.
104    ///
105    /// # Returns
106    ///
107    /// Returns a `Vec<&str>` containing the token's details. These details are typically part-of-speech information or other metadata about the token.
108    ///
109    /// # Process
110    ///
111    /// 1. **Check if details are already set**:
112    ///    - If `self.details` is `None`, the method will attempt to fetch the details from either the system dictionary or the user dictionary.
113    ///    - If the `word_id` is unknown, a default value `UNK` is returned.
114    /// 2. **Fetch details from dictionaries**:
115    ///    - If the `word_id` corresponds to a system dictionary entry, details are fetched from `self.dictionary`.
116    ///    - If the `word_id` corresponds to a user-defined dictionary, details are fetched from `self.user_dictionary`.
117    /// 3. **Store details**:
118    ///    - The fetched details are stored in `self.details` as `Some(Vec<Cow<str>>)` to avoid recalculating them in subsequent calls.
119    /// 4. **Return details as `&str`**:
120    ///    - The `Cow<str>` values stored in `self.details` are converted to `&str` and returned.
121    ///
122    /// # Notes
123    ///
124    /// - The first time this method is called, it fetches the details from the dictionary (or user dictionary), but on subsequent calls, it returns the cached details in `self.details`.
125    /// - If the token is unknown and no details can be retrieved, a default value (`UNK`) is used.
126    pub fn details(&mut self) -> Vec<&str> {
127        // Ensure details are initialized
128        self.ensure_details();
129
130        // Fast path: return references without allocation
131        match &self.details {
132            Some(details) => details.iter().map(|x| x.as_ref()).collect(),
133            None => UNK.to_vec(), // Fallback, should not happen after ensure_details()
134        }
135    }
136
137    /// Helper method to ensure details are loaded without returning them
138    fn ensure_details(&mut self) {
139        if self.details.is_none() {
140            let tmp = if self.word_id.is_unknown() {
141                UNK.to_vec()
142            } else if self.word_id.is_system() {
143                self.dictionary.word_details(self.word_id.id as usize)
144            } else {
145                match self.user_dictionary {
146                    Some(user_dictionary) => user_dictionary.word_details(self.word_id.id as usize),
147                    None => UNK.to_vec(),
148                }
149            };
150
151            self.details = Some(tmp.into_iter().map(Cow::Borrowed).collect());
152        }
153    }
154
155    /// Retrieves the token's detail at the specified index, if available.
156    ///
157    /// # Arguments
158    ///
159    /// * `index` - The index of the detail to retrieve.
160    ///
161    /// # Returns
162    ///
163    /// Returns an `Option<&str>` that contains the detail at the specified index.
164    /// If the index is out of bounds or no details are available, `None` is returned.
165    ///
166    /// # Details
167    ///
168    /// - This method first ensures that the token's details are populated by calling `self.details()`.
169    /// - If details are available and the provided index is valid, the detail at the specified index is returned as `Some(&str)`.
170    /// - If the index is out of range, `None` is returned.
171    pub fn get_detail(&mut self, index: usize) -> Option<&str> {
172        self.details().get(index).copied()
173    }
174
175    /// Sets the token's detail at the specified index with the provided value.
176    ///
177    /// # Arguments
178    ///
179    /// * `index` - The index of the detail to set. This specifies which detail to update.
180    /// * `detail` - A `Cow<'a, str>` representing the new detail value to set. It can either be a borrowed or owned string.
181    ///
182    /// # Details
183    ///
184    /// - If the token's details have already been populated (`self.details` is `Some`), this method updates the detail at the specified index.
185    /// - If the provided index is valid (within bounds of the `details` vector), the detail at that index is replaced by the new `detail` value.
186    /// - If the details have not been set (`self.details` is `None`), this method does nothing.
187    /// - This method does not handle index out-of-bounds errors explicitly, so it assumes that the index provided is valid.
188    ///
189    /// # Notes
190    ///
191    /// - The `Cow<'a, str>` type allows flexibility, as it can handle either borrowed or owned strings.
192    /// - This method does not initialize the details if they are not already set. To ensure the details are set, `details()` can be called prior to calling this method.
193    pub fn set_detail(&mut self, index: usize, detail: Cow<'a, str>) {
194        if let Some(details) = self.details.as_mut() {
195            details[index] = detail;
196        }
197    }
198
199    /// Retrieves the token's detail by field name.
200    ///
201    /// # Arguments
202    ///
203    /// * `field_name` - The name of the field to retrieve.
204    ///
205    /// # Returns
206    ///
207    /// Returns an `Option<&str>` containing the value of the specified field.
208    /// If the field name is not found or the schema is not available, `None` is returned.
209    ///
210    /// # Example
211    ///
212    /// ```no_run
213    /// # use lindera::token::Token;
214    /// # let mut token: Token = unimplemented!();
215    /// let base_form = token.get("base_form");
216    /// let pos = token.get("major_pos");
217    /// ```
218    pub fn get(&mut self, field_name: &str) -> Option<&str> {
219        // Get field index from schema
220        let index = self
221            .dictionary
222            .metadata
223            .dictionary_schema
224            .get_field_index(field_name)?;
225
226        // Handle common fields
227        match index {
228            0 => Some(self.surface.as_ref()), // surface
229            1..=3 => None, // left_context_id, right_context_id, cost are not stored in token
230            _ => {
231                // For custom fields (index >= 4), get from details
232                // details array doesn't include the first 4 common fields
233                self.get_detail(index - 4)
234            }
235        }
236    }
237
238    /// Returns all token fields as a JSON Value.
239    ///
240    /// # Returns
241    ///
242    /// Returns a `serde_json::Value` containing all available fields and their values.
243    /// Numeric fields (byte_start, byte_end, word_id) are represented as numbers,
244    /// while text fields remain as strings.
245    ///
246    /// # Example
247    ///
248    /// ```no_run
249    /// # use lindera::token::Token;
250    /// # let mut token: Token = unimplemented!();
251    /// let value = token.as_value();
252    /// println!("Surface: {}", value["surface"]);
253    /// println!("Byte start: {}", value["byte_start"]); // This is a number
254    /// println!("Word ID: {}", value["word_id"]); // This is a number
255    /// ```
256    pub fn as_value(&mut self) -> Value {
257        // Get schema info first
258        let schema_custom_fields = self
259            .dictionary
260            .metadata
261            .dictionary_schema
262            .get_custom_fields();
263
264        // Copy values before mutable borrow
265        let surface = self.surface.to_string();
266        let byte_start = self.byte_start;
267        let byte_end = self.byte_end;
268        let word_id = self.word_id.id;
269
270        // Get details (requires mutable borrow)
271        let details = self.details();
272
273        // Build JSON object
274        let mut obj = serde_json::Map::new();
275
276        // Add surface as string
277        obj.insert("surface".to_string(), json!(surface));
278
279        // Add byte positions as numbers
280        obj.insert("byte_start".to_string(), json!(byte_start));
281        obj.insert("byte_end".to_string(), json!(byte_end));
282
283        // Add word_id as number
284        obj.insert("word_id".to_string(), json!(word_id));
285
286        // Add each custom field from the schema
287        for (i, field_name) in schema_custom_fields.iter().enumerate() {
288            if let Some(value) = details.get(i) {
289                // Try to parse as number if possible, otherwise keep as string
290                if let Ok(num) = value.parse::<i64>() {
291                    obj.insert(field_name.to_string(), json!(num));
292                } else if let Ok(num) = value.parse::<f64>() {
293                    obj.insert(field_name.to_string(), json!(num));
294                } else {
295                    obj.insert(field_name.to_string(), json!(*value));
296                }
297            }
298        }
299
300        Value::Object(obj)
301    }
302}