lindera_tokenizer/
token.rs

1use once_cell::sync::Lazy;
2use serde::Serialize;
3
4use lindera_core::dictionary::{Dictionary, UserDictionary};
5use lindera_core::word_entry::WordId;
6
/// Details assigned to a token whose word ID is unknown: the single entry `"UNK"`.
static UNK: Lazy<Vec<&str>> = Lazy::new(|| vec!["UNK"]);
8
/// A single token together with the dictionaries needed to look up its details.
///
/// Field order is part of the derived `Serialize` output, so it is kept as is.
#[derive(Serialize, Clone)]
pub struct Token<'a> {
    /// Text content of the token.
    pub text: &'a str,

    /// Starting position of the token in bytes.
    pub byte_start: usize,

    /// Ending position of the token in bytes.
    pub byte_end: usize,

    /// Position, expressed in number of tokens.
    pub position: usize,

    /// The length expressed in terms of number of original tokens.
    /// `Token::new` initializes this to 1.
    pub position_length: usize,

    /// The ID of the word and a flag to indicate whether the word is registered in the dictionary.
    pub word_id: WordId,

    /// Reference to the dictionary; consulted for details when `word_id` is a system ID.
    pub dictionary: &'a Dictionary,

    /// Reference to the user dictionary, if any; consulted for details when `word_id`
    /// is neither unknown nor a system ID.
    pub user_dictionary: Option<&'a UserDictionary>,

    /// Details about the token.
    /// It contains metadata for tokens, such as part-of-speech information.
    /// Starts as `None` and is filled in by `get_details` or `set_details`.
    details: Option<Vec<String>>,
}
39
40impl<'a> Token<'a> {
41    pub fn new(
42        text: &'a str,
43        start: usize,
44        end: usize,
45        position: usize,
46        word_id: WordId,
47        dictionary: &'a Dictionary,
48        user_dictionary: Option<&'a UserDictionary>,
49    ) -> Self {
50        Self {
51            text,
52            details: None,
53            byte_start: start,
54            byte_end: end,
55            position,
56            position_length: 1,
57            word_id,
58            dictionary,
59            user_dictionary,
60        }
61    }
62
63    fn details(&self) -> Option<Vec<&str>> {
64        match &self.details {
65            Some(details) => {
66                let mut v = Vec::new();
67                for detail in details.iter() {
68                    let a = detail.as_str();
69                    v.push(a);
70                }
71                Some(v)
72            }
73            None => None,
74        }
75    }
76
77    // pub fn get_details(&mut self) -> Option<Vec<String>> {
78    pub fn get_details(&mut self) -> Option<Vec<&str>> {
79        if self.details.is_some() {
80            return self.details();
81        }
82
83        if self.word_id.is_unknown() {
84            self.set_details(Some(UNK.iter().map(|v| v.to_string()).collect()));
85            return self.details();
86        }
87
88        self.details = if self.word_id.is_system() {
89            self.dictionary.word_details(self.word_id.0 as usize)
90        } else {
91            match self.user_dictionary {
92                Some(user_dictionary) => user_dictionary.word_details(self.word_id.0 as usize),
93                None => None,
94            }
95        };
96        self.details()
97    }
98
99    pub fn set_details(&mut self, details: Option<Vec<String>>) -> &Token<'a> {
100        self.details = details;
101        self
102    }
103}