igo/dictionary/
worddic.rs1use std::io::{self, BufReader};
2use crate::trie::Searcher;
3use crate::dictionary;
4use crate::dictionary::ViterbiNode;
5use crate::util::*;
6use crate::{Utf16Char, Utf16Str};
7use log::debug;
8
9
10pub struct WordDic {
11 trie: Searcher,
12 data: String,
13 indices: Box<[i32]>,
14
15 costs: Box<[i16]>,
17 left_ids: Box<[i16]>,
19 right_ids: Box<[i16]>,
21 data_offsets: Box<[i32]>
23}
24
25impl WordDic {
26 pub fn new(dir: &mut dyn DirLike) -> io::Result<WordDic> {
27 let word2id_path = "word2id";
28 let dat_path ="word.dat";
29 let idx_path = "word.ary.idx";
30 let inf_path = "word.inf";
31
32 let inf_size = dir.file_size(inf_path)?;
33 let mut reader = BufReader::new(dir.open(inf_path)?);
34 let word_count = (inf_size / (4 + 2 + 2 + 2)) as usize;
35 debug!("word_count: {}", word_count);
36
37 let word_data = read_all_as_chars(dir, dat_path)?;
38 let data_offsets = reader.get_int_array(word_count)?;
39 let (word_data, data_offsets) = convert2utf8_data(&word_data, &data_offsets);
40
41 Ok(WordDic {
42 trie: Searcher::new(dir.open(word2id_path)?)?,
43 data: word_data,
44 indices: read_all_as_int_array(dir, idx_path)?,
45
46 data_offsets,
47 left_ids: reader.get_short_array(word_count)?,
48 right_ids: reader.get_short_array(word_count)?,
49 costs: reader.get_short_array(word_count)?
50 })
51 }
52
53
54 pub fn search(&self, text: &Utf16Str, start: usize, callback: &mut dyn dictionary::Callback) {
55 self.trie.each_common_prefix(text, start, |start: usize, offset: i32, trie_id: i32| {
56 let trie_id = trie_id as usize;
65 let end: i32 = self.indices[trie_id + 1];
66
67 for i in self.indices[trie_id]..end {
68 let idx = i as usize;
69 callback.call(ViterbiNode {
70 word_id: i,
71 start,
72 length: offset as i16,
73 cost: i32::from(self.costs[idx]),
74 left_id: self.left_ids[idx],
75 right_id: self.right_ids[idx],
76 is_space: false,
77 prev: None
78 });
79 }
80 });
81 }
82
83 pub fn search_from_trie_id(&self, trie_id: i32, start: usize, word_length: usize,
84 is_space: bool, callback: &mut dyn dictionary::Callback) {
85 let trie_id = trie_id as usize;
86 let end = self.indices[trie_id + 1];
87 for i in self.indices[trie_id]..end {
88 let idx = i as usize;
89 callback.call(ViterbiNode {
90 word_id: i,
91 start,
92 length: word_length as i16,
93 cost: i32::from(self.costs[idx]),
94 left_id: self.left_ids[idx],
95 right_id: self.right_ids[idx],
96 is_space,
97 prev: None
98 });
99 }
100 }
101
102 pub fn word_data(&self, word_id: i32) -> &str {
103 let word_id = word_id as usize;
104 &self.data[
105 (self.data_offsets[word_id] as usize) .. (self.data_offsets[word_id + 1] as usize)]
106 }
107}
108
109fn convert2utf8_data(utf16_str: &[Utf16Char], offsets: &[i32]) -> (String, Box<[i32]>) {
111 let mut buf = String::with_capacity(utf16_str.len() * 3);
112 let mut new_offset = vec![0i32; offsets.len()];
113
114 for word_id in 0..(offsets.len() - 1) {
115 let offset = offsets[word_id] as usize;
116 let next_offset = offsets[word_id + 1] as usize;
117 let word_data = String::from_utf16_lossy(&utf16_str[offset..next_offset]);
118 buf.push_str(&word_data);
119 new_offset[word_id + 1] = buf.len() as i32;
120 }
121 debug!("buf size: {} / {}", buf.len(), buf.capacity());
122
123 (buf, new_offset.into_boxed_slice())
124}