1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
use std::borrow::Cow;
use byteorder::{ByteOrder, LittleEndian};
use once_cell::sync::Lazy;
use serde::Serialize;
use crate::{dictionary::Dictionary, user_dictionary::UserDictionary, word_entry::WordId};
// Detail list handed out by `Token::get_details` for tokens whose word id is
// reported as unknown; built lazily on first access.
static UNK: Lazy<Vec<&str>> = Lazy::new(|| vec!["UNK"]);
/// A single token together with the dictionary handles needed to look up its
/// details on demand.
///
/// The lifetime `'a` ties the token to the borrowed dictionaries (and to the
/// text, when it is borrowed).
#[derive(Serialize, Clone)]
pub struct Token<'a> {
/// Surface text of the token; read through `get_text`, replaced through `set_text`.
text: Cow<'a, str>,
/// Value passed as `start` to `Token::new`.
/// NOTE(review): presumably the byte offset where the token begins in the
/// analyzed text; confirm against the tokenizer.
pub byte_start: usize,
/// Value passed as `end` to `Token::new`.
/// NOTE(review): presumably the byte offset where the token ends (exclusivity
/// not visible here); confirm against the tokenizer.
pub byte_end: usize,
/// Position of the token as supplied by the caller of `Token::new`.
pub position: usize,
/// Initialized to 1 by `Token::new`.
/// NOTE(review): presumably the number of positions the token spans; confirm.
pub position_length: usize,
/// Identifier used by `get_details` to locate this token's entry in the
/// dictionary data.
pub word_id: WordId,
/// Dictionary consulted by `get_details` when `word_id.is_system()` is true.
pub dictionary: &'a Dictionary,
/// Dictionary consulted by `get_details` when the word id is neither unknown
/// nor a system id; `None` when no user dictionary is attached.
pub user_dictionary: Option<&'a UserDictionary>,
/// Details cache: `None` until `get_details` resolves them or `set_details`
/// stores them explicitly.
details: Option<Vec<String>>,
}
impl<'a> Token<'a> {
    /// Creates a token that owns a copy of `text`.
    ///
    /// `start` and `end` are stored as `byte_start` and `byte_end`.
    /// `position_length` starts at 1 and the details are left unresolved until
    /// [`Token::get_details`] or [`Token::set_details`] is called.
    pub fn new(
        text: &str,
        start: usize,
        end: usize,
        position: usize,
        word_id: WordId,
        dictionary: &'a Dictionary,
        user_dictionary: Option<&'a UserDictionary>,
    ) -> Self {
        Self {
            text: Cow::Owned(text.to_owned()),
            details: None,
            byte_start: start,
            byte_end: end,
            position,
            position_length: 1,
            word_id,
            dictionary,
            user_dictionary,
        }
    }

    /// Returns the token text.
    pub fn get_text(&self) -> &str {
        &self.text
    }

    /// Replaces the token text with `text` and returns the token for chaining.
    pub fn set_text(&mut self, text: String) -> &Token<'a> {
        self.text = Cow::Owned(text);
        self
    }

    /// Borrows the cached details as string slices, or `None` if nothing is cached.
    fn details(&self) -> Option<Vec<&str>> {
        self.details
            .as_ref()
            .map(|details| details.iter().map(String::as_str).collect())
    }

    /// Returns the details of this token, resolving and caching them on first use.
    ///
    /// Cached details (from an earlier call or from [`Token::set_details`]) are
    /// returned as-is. Otherwise:
    ///
    /// * a token whose word id is unknown yields the `UNK` detail list;
    /// * a system word id is looked up in `dictionary`, any other id in
    ///   `user_dictionary`.
    ///
    /// Returns `None`, leaving the cache empty, when the lookup needs a user
    /// dictionary but none is attached, when the word id or the stored offset
    /// points outside the dictionary data, or when the entry cannot be
    /// deserialized.
    pub fn get_details(&mut self) -> Option<Vec<&str>> {
        if self.details.is_none() {
            self.details = self.load_details();
        }
        self.details()
    }

    /// Resolves the details for `word_id` without touching the cache.
    fn load_details(&self) -> Option<Vec<String>> {
        if self.word_id.is_unknown() {
            return Some(UNK.iter().map(|detail| detail.to_string()).collect());
        }

        let (words_idx_data, words_data) = if self.word_id.is_system() {
            (
                self.dictionary.words_idx_data.as_slice(),
                self.dictionary.words_data.as_slice(),
            )
        } else {
            let user_dictionary = self.user_dictionary?;
            (
                user_dictionary.words_idx_data.as_slice(),
                user_dictionary.words_data.as_slice(),
            )
        };

        // The index table holds one little-endian u32 per word id: the offset
        // of that word's entry inside `words_data`. Checked arithmetic and
        // `get` turn an out-of-range id or offset into `None` instead of a
        // panic, matching how deserialization failures are reported.
        let idx_start = (self.word_id.0 as usize).checked_mul(4)?;
        let idx_end = idx_start.checked_add(4)?;
        let offset = LittleEndian::read_u32(words_idx_data.get(idx_start..idx_end)?) as usize;
        let data = words_data.get(offset..)?;

        bincode::deserialize_from(data).ok()
    }

    /// Overwrites the cached details (use `None` to clear them) and returns the
    /// token for chaining.
    pub fn set_details(&mut self, details: Option<Vec<String>>) -> &Token<'a> {
        self.details = details;
        self
    }
}