1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
use std::borrow::Cow;

use byteorder::{ByteOrder, LittleEndian};
use once_cell::sync::Lazy;
use serde::Serialize;

use crate::{dictionary::Dictionary, user_dictionary::UserDictionary, word_entry::WordId};

/// Details assigned to a token whose word ID is unknown (see `Token::get_details`).
static UNK: Lazy<Vec<&str>> = Lazy::new(|| vec!["UNK"]);

/// A token: a piece of text with its byte offsets and position, plus the references to
/// the dictionaries that are used to resolve its details.
#[derive(Serialize, Clone)]
pub struct Token<'a> {
    /// Text content of the token.
    text: Cow<'a, str>,

    /// Starting position of the token in bytes.
    pub byte_start: usize,

    /// Ending position of the token in bytes.
    pub byte_end: usize,

    /// Position, expressed in number of tokens.
    pub position: usize,

    /// The length, expressed in terms of the number of original tokens.
    pub position_length: usize,

    /// The ID of the word and a flag to indicate whether the word is registered in the dictionary.
    pub word_id: WordId,

    /// Reference to the dictionary.
    pub dictionary: &'a Dictionary,

    /// Reference to the user dictionary, if any.
    pub user_dictionary: Option<&'a UserDictionary>,

    /// Details about the token.
    /// It contains metadata for tokens, such as part-of-speech information.
    /// `None` until the details have been set or looked up.
    details: Option<Vec<String>>,
}

impl<'a> Token<'a> {
    /// Creates a token for `text` spanning the byte range `start..end`.
    ///
    /// The text is copied into the token, `position_length` is initialized to 1 and the
    /// details are left unset; they are filled in by [`Token::get_details`] or
    /// [`Token::set_details`].
    pub fn new(
        text: &str,
        start: usize,
        end: usize,
        position: usize,
        word_id: WordId,
        dictionary: &'a Dictionary,
        user_dictionary: Option<&'a UserDictionary>,
    ) -> Self {
        Self {
            text: Cow::Owned(text.to_string()),
            details: None,
            byte_start: start,
            byte_end: end,
            position,
            position_length: 1,
            word_id,
            dictionary,
            user_dictionary,
        }
    }

    /// Returns the text content of the token.
    pub fn get_text(&self) -> &str {
        self.text.as_ref()
    }

    /// Replaces the text content of the token and returns the token for chaining.
    pub fn set_text(&mut self, text: String) -> &Token<'a> {
        self.text = Cow::Owned(text);
        self
    }

    /// Returns the currently stored details as borrowed string slices, or `None` when no
    /// details are stored.
    fn details(&self) -> Option<Vec<&str>> {
        self.details
            .as_deref()
            .map(|details| details.iter().map(String::as_str).collect())
    }

    /// Returns the details of the token, loading and caching them on first use.
    ///
    /// * If details are already stored, they are returned as is.
    /// * If the word ID is unknown, the details are set to the `UNK` placeholder.
    /// * Otherwise the details are read from the system dictionary or from the user
    ///   dictionary, depending on the word ID.
    ///
    /// Returns `None` when the word is not a system word and no user dictionary is
    /// attached, when the word ID or the offset stored for it lies outside the dictionary
    /// data, or when the stored entry cannot be deserialized.
    pub fn get_details(&mut self) -> Option<Vec<&str>> {
        if self.details.is_some() {
            return self.details();
        }

        if self.word_id.is_unknown() {
            self.details = Some(UNK.iter().map(|v| v.to_string()).collect());
            return self.details();
        }

        let (words_idx_data, words_data) = if self.word_id.is_system() {
            (
                self.dictionary.words_idx_data.as_slice(),
                self.dictionary.words_data.as_slice(),
            )
        } else {
            let user_dictionary = self.user_dictionary?;
            (
                user_dictionary.words_idx_data.as_slice(),
                user_dictionary.words_data.as_slice(),
            )
        };

        // Each word owns a 4-byte little-endian entry in the index data that holds the
        // offset of its record in the words data. Both lookups are bounds-checked so that
        // an out-of-range word ID or a corrupted offset yields `None` instead of a panic.
        let idx_start = (self.word_id.0 as usize).checked_mul(4)?;
        let idx_bytes = words_idx_data.get(idx_start..)?.get(..4)?;
        let idx = LittleEndian::read_u32(idx_bytes);
        let data = words_data.get(idx as usize..)?;

        // A record that fails to deserialize leaves the details unset.
        self.details = bincode::deserialize_from(data).ok();

        self.details()
    }

    /// Replaces the stored details (or clears them with `None`) and returns the token for
    /// chaining.
    pub fn set_details(&mut self, details: Option<Vec<String>>) -> &Token<'a> {
        self.details = details;
        self
    }
}