igo/dictionary/build/
charcategory.rs1use std::path::Path;
2use std::path::PathBuf;
3use std::io::{BufWriter, Write};
4use std::fs::File;
5use std::cmp;
6use std::collections::HashMap;
7use std::error::Error;
8use std::rc::Rc;
9use byteorder::{WriteBytesExt, NativeEndian as NE};
10use crate::util::*;
11use crate::dictionary::build::*;
12use crate::dictionary::charcategory::{Category, SPACE_CHAR};
13use crate::trie::Searcher;
14
/// Prefix placed in front of a category name to form the key under which the
/// category is looked up in the word trie.
pub const KEY_PREFIX: &str = "\x02";
16
/// Builder for the character-category files derived from `char.def`.
pub struct CharCategory {
    /// Directory that contains the `char.def` definition file.
    input_dir: PathBuf,
    /// Encoding name handed to the line reader when opening `char.def`.
    encoding: String,
    /// Directory the generated files are written to; the `word2id` trie is
    /// also read from here.
    output_dir: PathBuf,
}
23
24impl CharCategory {
25 pub fn new(input_dir: &Path, encoding: &str, output_dir: &Path) -> CharCategory {
31 CharCategory {
32 input_dir: input_dir.to_owned(),
33 encoding: encoding.to_owned(),
34 output_dir: output_dir.to_owned()
35 }
36 }
37
38 pub fn build(self) -> AppResult<()> {
40 let ccmap = self.parse_char_category_def()?;
42
43 {
44 let mut categories: Vec<&Category> = Vec::new();
46 for e in ccmap.values() {
47 categories.push(e);
48 }
49 self.save_char_category_map(categories)?;
50 }
51
52 self.build_code_category_map(ccmap)?;
54
55 Ok(())
56 }
57
58 fn parse_char_category_def(&self) -> AppResult<HashMap<String, Category>> {
59 let path = self.input_dir.join("char.def");
60 let too_few_fields = |rl: &ReadLine| -> AppError {
61 rl.parse_error("Invalid char category definition (too few fields).")
62 };
63 let parse_0or1 = |str: Option<&str>, rl: &ReadLine| -> AppResult<bool> {
64 str.ok_or_else(|| too_few_fields(rl))
65 .and_then(|s|
66 if s == "1" {
67 Ok(true)
68 } else if s == "0" {
69 Ok(false)
70 } else {
71 Err(rl.parse_error("Invalid char category definition (INVOKE must be '0' or '1')."))
72 }
73 )
74 };
75 let mut rl = ReadLine::new(path.as_path(), &self.encoding)?;
76 let srch = Searcher::from_path(self.output_dir.join("word2id").as_path())?;
77 let mut map = HashMap::new();
78
79 let mut s = String::new();
80 loop {
81 let len = rl.next(&mut s).map_err(|e| rl.convert_error(e))?;
82 if len < 1 {
83 break;
84 }
85 let line = s.trim_end();
86 if line.is_empty() || line.starts_with('#') || line.starts_with('0') {
87 continue;
88 }
89
90 let mut ss = line.split_whitespace();
91 let name = ss.next().ok_or_else(|| too_few_fields(&rl))?;
92 let invoke: bool = parse_0or1(ss.next(), &rl)?; let group: bool = parse_0or1(ss.next(), &rl)?; let length: i32 = ss.next()
96 .ok_or_else(|| too_few_fields(&rl))
97 .and_then(|s| s.parse().map_err(AppError::from))?;
98 let key_utf16 = (KEY_PREFIX.to_string() + name).encode_utf16().collect::<Vec<_>>();
99 let id = srch.search(&key_utf16);
100
101 if length < 0 {
102 return Err(rl.parse_error("Invalid char category definition (LENGTH must be 0 or positive integer)."));
103 }
104 if id < 0 {
105 return Err(rl.parse_error(format!("Category '{}' is unregistered in trie", name)));
106 }
107 map.insert(name.to_string(), Category {
108 id,
109 length,
110 invoke,
111 group
112 });
113 }
114
115 if !map.contains_key("DEFAULT") {
117 return Err(rl.parse_error("Missing mandatory category 'DEFAULT'."));
118 }
119 if !map.contains_key("SPACE") {
120 return Err(rl.parse_error("Missing mandatory category 'SPACE'."));
121 }
122 Ok(map)
123 }
124
125 fn save_char_category_map(&self, mut categories: Vec<&Category>) -> AppResult<()> {
126 let mut writer = BufWriter::new(File::create(self.output_dir.join("char.category").as_path())?);
127 categories.sort();
128 for e in categories {
129 writer.write_i32::<NE>(e.id)?;
130 writer.write_i32::<NE>(e.length)?;
131 writer.write_i32::<NE>(if e.invoke { 1 } else { 0 })?;
132 writer.write_i32::<NE>(if e.group { 1 } else { 0 })?;
133 }
134 Ok(writer.flush()?)
135 }
136
137 fn build_code_category_map(&self, map: HashMap<String, Category>) -> AppResult<()> {
138 let mut chars: Vec<Rc<CharId>> = Vec::with_capacity(0x10_000);
139 {
140 let dft = Rc::new(CharId::new(map["DEFAULT"].id));
141 for _ in 0..0x10_000 {
142 chars.push(dft.clone());
143 }
144 }
145
146 {
147 let path = self.input_dir.join("char.def");
148 let mut rl = ReadLine::new(path.as_path(), &self.encoding)?;
149 let mut s = String::new();
150 loop {
151 let len = rl.next(&mut s).map_err(|e| rl.parse_error(e.description()))?;
152 if len < 1 {
153 break;
154 }
155 let line = s.trim_end();
156 if line.is_empty() || !line.starts_with('0') {
157 continue;
158 }
159
160 let mut ss = line.split_whitespace();
161 let beg: i32;
162 let end: i32;
163 let ss0 = ss.next().ok_or_else(|| rl.parse_error("Too few fields"))?;
164 if let Some(idx) = ss0.find("..") {
165 beg = i32::from_str_radix(&ss0[2..idx], 16)
166 .map_err(|e| rl.convert_error(e))?;
167 end = i32::from_str_radix(&ss0[(idx + 2 + 2)..], 16)
168 .map_err(|e| rl.convert_error(e))?;
169 } else {
170 beg = i32::from_str_radix(&ss0[2..], 16)
171 .map_err(|e| rl.convert_error(e))?;
172 end = beg;
173 }
174
175 if !(0 <= beg && beg <= 0xFFFF &&
176 0 <= end && end <= 0xFFFF && beg <= end) {
177 return Err(rl.parse_error("Wrong UCS2 code specified."));
178 }
179
180 let category_name = ss.next().ok_or_else(|| rl.parse_error("Too few fields"))?;
182 let category = map.get(category_name).ok_or_else(|| rl.parse_error(format!("Category '{}' is undefined.", category_name)))?;
183 let ch = {
184 let mut ch = CharId::new(category.id);
185 while let Some(f) = ss.next() {
186 if f.starts_with('#') { break; }
187 let category = map.get(f).ok_or_else(|| rl.parse_error(format!("Category '{}' is undefined.", f)))?;
188 ch.add(category.id);
189 }
190 Rc::new(ch)
191 };
192
193 for i in beg..=end {
195 chars[i as usize] = ch.clone();
196 }
197 }
198
199 if chars[SPACE_CHAR as usize].id != map["SPACE"].id {
200 return Err(rl.parse_error("0x0020 is reserved for 'SPACE' category"));
201 }
202 }
203
204 let mut writer = BufWriter::new(File::create(self.output_dir.join("code2category").as_path())?);
205 for c in &chars {
206 writer.write_i32::<NE>(c.id)?;
207 }
208 for c in &chars {
209 writer.write_i32::<NE>(c.mask)?;
210 }
211
212 Ok(writer.flush()?)
213 }
214}
215
216
217impl cmp::Ord for Category {
218 fn cmp(&self, other: &Self) -> cmp::Ordering {
219 self.id.cmp(&other.id)
220 }
221}
222
223impl cmp::PartialOrd for Category {
224 fn partial_cmp(&self, other: &Self) -> Option<cmp::Ordering> {
225 Some(self.cmp(other))
226 }
227}
228
229impl cmp::PartialEq for Category {
230 fn eq(&self, other: &Self) -> bool {
231 self.id.eq(&other.id)
232 }
233}
234
235impl cmp::Eq for Category {}
236
237
/// Category assignment of one code point: its primary category id plus a bit
/// set of every category id attached to it.
struct CharId {
    /// Primary category id.
    id: i32,
    /// Bit `n` is set when category id `n` has been added.
    mask: i32,
}

impl CharId {
    /// Creates an entry whose primary category is `id`, with the bit for `id`
    /// already set in the mask.
    ///
    /// `id` is used as a shift amount on an `i32`, so it is expected to be in
    /// `0..32`.
    pub fn new(id: i32) -> CharId {
        CharId {
            id,
            mask: 1_i32 << id,
        }
    }

    /// Sets the mask bit for category id `i`; the primary `id` is unchanged.
    ///
    /// `i` is used as a shift amount on an `i32`, so it is expected to be in
    /// `0..32`.
    pub fn add(&mut self, i: i32) {
        let bit: i32 = 1 << i;
        self.mask |= bit;
    }
}
253}