1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
use byteorder::{ByteOrder, LittleEndian};
use once_cell::sync::Lazy;
use serde::Serialize;

use crate::{dictionary::Dictionary, user_dictionary::UserDictionary, word_entry::WordId};

/// Details used for tokens whose word ID is reported as unknown: a single `"UNK"` entry.
///
/// Built lazily on first access.
static UNK: Lazy<Vec<&str>> = Lazy::new(|| vec!["UNK"]);

/// A single token: a slice of text together with its byte offsets, its position
/// in the token sequence, its word ID, and references to the dictionaries that
/// its details are looked up in.
#[derive(Serialize, Clone)]
pub struct Token<'a> {
    /// Text content of the token.
    pub text: &'a str,

    /// Starting position of the token in bytes.
    pub byte_start: usize,

    /// Ending position of the token in bytes.
    pub byte_end: usize,

    /// Position, expressed in number of tokens.
    pub position: usize,

    /// The length expressed in terms of the number of original tokens.
    pub position_length: usize,

    /// The ID of the word and a flag to indicate whether the word is registered in the dictionary.
    pub word_id: WordId,

    /// Reference to the dictionary.
    pub dictionary: &'a Dictionary,

    /// Reference to the user dictionary, if one is in use.
    pub user_dictionary: Option<&'a UserDictionary>,

    /// Details about the token.
    /// It contains metadata for tokens, such as part-of-speech information.
    /// `None` until the details have been loaded or explicitly set.
    details: Option<Vec<String>>,
}

impl<'a> Token<'a> {
    /// Creates a token covering the bytes `start..end` of the input.
    ///
    /// `position_length` is initialised to `1` and the details are left
    /// unloaded; they are resolved on the first call to [`Token::get_details`].
    pub fn new(
        text: &'a str,
        start: usize,
        end: usize,
        position: usize,
        word_id: WordId,
        dictionary: &'a Dictionary,
        user_dictionary: Option<&'a UserDictionary>,
    ) -> Self {
        Self {
            text,
            byte_start: start,
            byte_end: end,
            position,
            position_length: 1,
            word_id,
            dictionary,
            user_dictionary,
            details: None,
        }
    }

    /// Returns a borrowed view of the cached details, or `None` when nothing
    /// has been loaded or set yet.
    fn details(&self) -> Option<Vec<&str>> {
        self.details
            .as_ref()
            .map(|details| details.iter().map(String::as_str).collect())
    }

    /// Resolves the details for this token without touching the cache.
    ///
    /// * Unknown words yield a copy of [`UNK`].
    /// * System words are looked up in `dictionary`, all other words in
    ///   `user_dictionary`.
    ///
    /// Returns `None` when a user dictionary is required but absent, when the
    /// word ID or the stored offset falls outside the dictionary buffers, or
    /// when the stored entry cannot be deserialized.
    fn load_details(&self) -> Option<Vec<String>> {
        if self.word_id.is_unknown() {
            return Some(UNK.iter().map(|s| (*s).to_owned()).collect());
        }

        let (words_idx_data, words_data) = if self.word_id.is_system() {
            (
                self.dictionary.words_idx_data.as_slice(),
                self.dictionary.words_data.as_slice(),
            )
        } else {
            let user_dictionary = self.user_dictionary?;
            (
                user_dictionary.words_idx_data.as_slice(),
                user_dictionary.words_data.as_slice(),
            )
        };

        // The index buffer is read as one little-endian `u32` per word ID; the
        // value is the byte offset of that word's entry in `words_data`.
        // Checked arithmetic and `get` turn an out-of-range ID or offset into
        // `None` instead of a slice-index panic.
        let start = (self.word_id.0 as usize).checked_mul(4)?;
        let end = start.checked_add(4)?;
        let offset = LittleEndian::read_u32(words_idx_data.get(start..end)?) as usize;
        let data = words_data.get(offset..)?;

        // A malformed entry is reported as "no details" rather than an error.
        bincode::deserialize_from(data).ok()
    }

    /// Returns the details of this token, loading and caching them on first use.
    ///
    /// Details that are already cached (from an earlier call or from
    /// [`Token::set_details`]) are returned as-is. Otherwise they are resolved
    /// from the dictionaries and stored on the token.
    ///
    /// Returns `None` when the details cannot be resolved: the word needs a
    /// user dictionary that is absent, the dictionary data does not contain
    /// the word ID, or the entry fails to deserialize. A `None` result is not
    /// cached, so a later call repeats the lookup.
    pub fn get_details(&mut self) -> Option<Vec<&str>> {
        if self.details.is_none() {
            self.details = self.load_details();
        }

        self.details()
    }

    /// Replaces the cached details and returns a shared reference to the token.
    ///
    /// Passing `None` clears the cache, so the next [`Token::get_details`]
    /// call loads the details from the dictionaries again.
    pub fn set_details(&mut self, details: Option<Vec<String>>) -> &Token<'a> {
        self.details = details;
        self
    }
}