1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
use std::io::{self, BufReader};
use std::fs::{self, File};
use std::path::Path;
use trie::{Searcher};
use dictionary;
use dictionary::ViterbiNode;
use util::*;
use {Utf16Char, Utf16String};
/// Word dictionary: a trie for lookups plus per-word tables indexed by word id.
pub struct WordDic {
    /// Trie built from the `word2id` file; `search` runs `each_common_prefix` on it.
    trie: Searcher,
    /// Text of every word entry, stored as UTF-8 (re-encoded from `word.dat` on load).
    data: String,
    /// Loaded from `word.ary.idx`. For a trie id `t`, the word ids attached to it
    /// are the half-open range `indices[t]..indices[t + 1]`.
    indices: Box<[i32]>,
    /// Cost of each word, indexed by word id (read from `word.inf`).
    costs: Box<[i16]>,
    /// Left id of each word, indexed by word id (read from `word.inf`).
    left_ids: Box<[i16]>,
    /// Right id of each word, indexed by word id (read from `word.inf`).
    right_ids: Box<[i16]>,
    /// Byte offsets into `data`; the text of word `w` is
    /// `data[data_offsets[w]..data_offsets[w + 1]]`.
    data_offsets: Box<[i32]>,
}
impl WordDic {
    /// Loads the word dictionary from the files found in `data_dir`.
    ///
    /// Four files are read:
    ///
    /// * `word2id` - handed to `Searcher::new` to build the trie,
    /// * `word.dat` - UTF-16 word data, re-encoded to UTF-8 while loading,
    /// * `word.ary.idx` - the trie-id to word-id index array,
    /// * `word.inf` - four consecutive arrays with one element per word:
    ///   data offsets (int), then left ids, right ids and costs (short).
    ///
    /// The number of words is the size of `word.inf` divided by the
    /// `4 + 2 + 2 + 2` bytes it stores per word.
    ///
    /// # Errors
    ///
    /// Returns the first I/O error produced while inspecting, opening or
    /// reading any of the files.
    pub fn new(data_dir: &Path) -> io::Result<WordDic> {
        // Bytes `word.inf` spends on one word: one int plus three shorts.
        const INF_BYTES_PER_WORD: u64 = 4 + 2 + 2 + 2;

        let inf_path = data_dir.join("word.inf");
        let inf_size = fs::metadata(&inf_path)?.len();
        let mut inf = BufReader::new(File::open(&inf_path)?);
        let entry_count = (inf_size / INF_BYTES_PER_WORD) as usize;
        debug!("word_count: {}", entry_count);

        let dat_path = data_dir.join("word.dat");
        let utf16_data = read_all_as_chars(&dat_path)?;
        // The offsets are the first array stored in `word.inf`.
        let utf16_offsets = inf.get_int_array(entry_count)?;
        let (utf8_data, utf8_offsets) = convert2utf8_data(&utf16_data, &utf16_offsets);

        let word2id_path = data_dir.join("word2id");
        let searcher = Searcher::new(&word2id_path)?;
        let idx_path = data_dir.join("word.ary.idx");
        let word_indices = read_all_as_int_array(&idx_path)?;

        // The three short arrays follow the offsets in `word.inf`; they must be
        // consumed from the same reader in exactly this order.
        let left = inf.get_short_array(entry_count)?;
        let right = inf.get_short_array(entry_count)?;
        let word_costs = inf.get_short_array(entry_count)?;

        Ok(WordDic {
            trie: searcher,
            data: utf8_data,
            indices: word_indices,
            costs: word_costs,
            left_ids: left,
            right_ids: right,
            data_offsets: utf8_offsets,
        })
    }

    /// Looks `text` up in the trie from position `start` and reports every word
    /// attached to each match.
    ///
    /// For each `(start, length, trie id)` triple delivered by
    /// `each_common_prefix`, one `ViterbiNode` per word id in
    /// `indices[trie_id]..indices[trie_id + 1]` is passed to `callback`.
    /// Nodes produced here always have `is_space == false` and `prev == None`.
    ///
    /// # Panics
    ///
    /// Panics if a trie id or word id falls outside the loaded tables.
    pub fn search(&self, text: &Utf16String, start: usize, callback: &mut dictionary::Callback) {
        self.trie.each_common_prefix(
            text,
            start,
            |match_start: usize, match_len: i32, node_id: i32| {
                let slot = node_id as usize;
                let upper: i32 = self.indices[slot + 1];
                let lower = self.indices[slot];
                for word_id in lower..upper {
                    let at = word_id as usize;
                    let node = ViterbiNode {
                        word_id: word_id,
                        start: match_start,
                        length: match_len as i16,
                        cost: self.costs[at] as i32,
                        left_id: self.left_ids[at],
                        right_id: self.right_ids[at],
                        is_space: false,
                        prev: None,
                    };
                    callback.call(node);
                }
            },
        );
    }

    /// Reports every word attached to an already known `trie_id`.
    ///
    /// One `ViterbiNode` per word id in `indices[trie_id]..indices[trie_id + 1]`
    /// is passed to `callback`, carrying the given `start`, `word_length`
    /// (narrowed to `i16`) and `is_space` values and `prev == None`.
    ///
    /// # Panics
    ///
    /// Panics if `trie_id` or a word id falls outside the loaded tables.
    pub fn search_from_trie_id(
        &self,
        trie_id: i32,
        start: usize,
        word_length: usize,
        is_space: bool,
        callback: &mut dictionary::Callback,
    ) {
        let slot = trie_id as usize;
        let upper = self.indices[slot + 1];
        let lower = self.indices[slot];
        for word_id in lower..upper {
            let at = word_id as usize;
            let node = ViterbiNode {
                word_id: word_id,
                start: start,
                length: word_length as i16,
                cost: self.costs[at] as i32,
                left_id: self.left_ids[at],
                right_id: self.right_ids[at],
                is_space: is_space,
                prev: None,
            };
            callback.call(node);
        }
    }

    /// Returns the stored text of the word `word_id`.
    ///
    /// The text is the slice of the UTF-8 data buffer delimited by
    /// `data_offsets[word_id]` and `data_offsets[word_id + 1]`.
    ///
    /// # Panics
    ///
    /// Panics if `word_id` or `word_id + 1` is not a valid index into the
    /// offset table, or if the two offsets do not form a valid string range.
    pub fn word_data(&self, word_id: i32) -> &str {
        let slot = word_id as usize;
        let begin = self.data_offsets[slot] as usize;
        let end = self.data_offsets[slot + 1] as usize;
        &self.data[begin..end]
    }
}
/// Re-encodes UTF-16 word data as UTF-8 and rebuilds the offset table to match.
///
/// Entry `i` of the input occupies `utf16_str[offsets[i]..offsets[i + 1]]`
/// (positions counted in UTF-16 code units). Each entry is decoded lossily
/// (invalid UTF-16 becomes U+FFFD) and appended to one UTF-8 buffer.
///
/// Returns that buffer together with a table of the same length as `offsets`
/// holding byte positions in the buffer: entry 0 is always `0` and entry
/// `i + 1` is the buffer length after entry `i` was appended.
///
/// An empty `offsets` yields an empty string and an empty table.
///
/// # Panics
///
/// Panics if a pair of adjacent offsets is not a valid range of `utf16_str`
/// (negative, decreasing or out of bounds).
fn convert2utf8_data(utf16_str: &[Utf16Char], offsets: &[i32]) -> (String, Box<[i32]>) {
    // A UTF-16 code unit never needs more than three UTF-8 bytes.
    let mut buf = String::with_capacity(utf16_str.len() * 3);
    let mut new_offsets: Vec<i32> = Vec::with_capacity(offsets.len());
    if !offsets.is_empty() {
        new_offsets.push(0);
    }
    // `windows(2)` yields nothing for fewer than two offsets, so the empty
    // table no longer underflows `offsets.len() - 1`.
    for bounds in offsets.windows(2) {
        let begin = bounds[0] as usize;
        let end = bounds[1] as usize;
        let word_data = String::from_utf16_lossy(&utf16_str[begin..end]);
        buf.push_str(&word_data);
        new_offsets.push(buf.len() as i32);
    }
    debug!("buf size: {} / {}", buf.len(), buf.capacity());
    (buf, new_offsets.into_boxed_slice())
}