1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
use byteorder::{ByteOrder, LittleEndian};
use once_cell::sync::Lazy;
use serde::Serialize;
use crate::{dictionary::Dictionary, user_dictionary::UserDictionary, word_entry::WordId};
// Detail list reported for tokens whose word id is flagged as unknown; built
// lazily on first access and copied into the token by `Token::get_details`.
static UNK: Lazy<Vec<&str>> = Lazy::new(|| vec!["UNK"]);
/// A single token together with the dictionaries needed to resolve its
/// detail strings on demand.
///
/// The lifetime `'a` ties the token to the borrowed text and to the
/// dictionaries it references.
#[derive(Serialize, Clone)]
pub struct Token<'a> {
/// Borrowed text of the token.
pub text: &'a str,
/// Start offset passed to `Token::new` (presumably a byte offset into the
/// source text; confirm against the caller).
pub byte_start: usize,
/// End offset passed to `Token::new` (presumably a byte offset into the
/// source text; whether it is exclusive is not visible here).
pub byte_end: usize,
/// Position value supplied by the caller of `Token::new`.
pub position: usize,
/// Initialised to `1` by `Token::new`.
pub position_length: usize,
/// Identifier used to locate this token's entry in the word data; also
/// tells whether the word is unknown or belongs to the system dictionary.
pub word_id: WordId,
/// Dictionary consulted by `get_details` when `word_id.is_system()` is true.
pub dictionary: &'a Dictionary,
/// Optional dictionary consulted by `get_details` for words that are neither
/// unknown nor system words.
pub user_dictionary: Option<&'a UserDictionary>,
// Cached detail strings; `None` until filled by `get_details` or
// `set_details`.
details: Option<Vec<String>>,
}
impl<'a> Token<'a> {
    /// Creates a token with no cached details and a `position_length` of `1`.
    ///
    /// # Arguments
    ///
    /// * `text` - borrowed text of the token.
    /// * `start` - stored as `byte_start`.
    /// * `end` - stored as `byte_end`.
    /// * `position` - stored as `position`.
    /// * `word_id` - identifier used later to look up the token's details.
    /// * `dictionary` - dictionary used for system words.
    /// * `user_dictionary` - optional dictionary used for non-system words.
    pub fn new(
        text: &'a str,
        start: usize,
        end: usize,
        position: usize,
        word_id: WordId,
        dictionary: &'a Dictionary,
        user_dictionary: Option<&'a UserDictionary>,
    ) -> Self {
        Self {
            text,
            details: None,
            byte_start: start,
            byte_end: end,
            position,
            position_length: 1,
            word_id,
            dictionary,
            user_dictionary,
        }
    }

    /// Returns a borrowed view of the cached details, or `None` when nothing
    /// has been cached yet.
    fn details(&self) -> Option<Vec<&str>> {
        self.details
            .as_ref()
            .map(|details| details.iter().map(String::as_str).collect())
    }

    /// Looks up the detail strings for this token without touching the cache.
    ///
    /// Returns `None` when the word belongs to a user dictionary that is not
    /// attached, when the index or word data is too short for this word id,
    /// or when the stored record cannot be deserialized.
    fn load_details(&self) -> Option<Vec<String>> {
        if self.word_id.is_unknown() {
            return Some(UNK.iter().map(|v| v.to_string()).collect());
        }

        let (words_idx_data, words_data) = if self.word_id.is_system() {
            (
                self.dictionary.words_idx_data.as_slice(),
                self.dictionary.words_data.as_slice(),
            )
        } else {
            let user_dictionary = self.user_dictionary?;
            (
                user_dictionary.words_idx_data.as_slice(),
                user_dictionary.words_data.as_slice(),
            )
        };

        // The index is read as consecutive little-endian u32 values, one per
        // word id, each giving the start offset of that word's record in
        // `words_data`. All lookups are checked so that truncated or
        // mismatched data yields `None` instead of a panic.
        let entry_start = (self.word_id.0 as usize).checked_mul(4)?;
        let entry = words_idx_data.get(entry_start..)?.get(..4)?;
        let record_start = LittleEndian::read_u32(entry) as usize;
        let record = words_data.get(record_start..)?;

        bincode::deserialize_from(record).ok()
    }

    /// Returns the detail strings for this token, resolving and caching them
    /// on first use.
    ///
    /// Unknown words yield the contents of `UNK`. System words are looked up
    /// in `dictionary`; any other word is looked up in `user_dictionary`.
    ///
    /// Returns `None` when the details cannot be resolved (no user dictionary
    /// attached, index or data out of range, or a deserialization failure).
    /// A failed lookup is not cached, so a later call tries again.
    pub fn get_details(&mut self) -> Option<Vec<&str>> {
        if self.details.is_none() {
            self.details = self.load_details();
        }
        self.details()
    }

    /// Replaces the cached details (pass `None` to clear them) and returns a
    /// shared reference to the token.
    pub fn set_details(&mut self, details: Option<Vec<String>>) -> &Token<'a> {
        self.details = details;
        self
    }
}