igo/dictionary/build/
charcategory.rs

1use std::path::Path;
2use std::path::PathBuf;
3use std::io::{BufWriter, Write};
4use std::fs::File;
5use std::cmp;
6use std::collections::HashMap;
7use std::error::Error;
8use std::rc::Rc;
9use byteorder::{WriteBytesExt, NativeEndian as NE};
10use crate::util::*;
11use crate::dictionary::build::*;
12use crate::dictionary::charcategory::{Category, SPACE_CHAR};
13use crate::trie::Searcher;
14
/// Prefix prepended to a category name to form the key that is looked up in the
/// `word2id` trie when resolving the category's id.
pub const KEY_PREFIX: &str = "\x02";
16
/// Builder that turns the text character-category definition (`char.def`) into
/// binary data files.
pub struct CharCategory {
    /// Directory that contains the text dictionary sources.
    input_dir: PathBuf,
    /// Character encoding of the text dictionary sources.
    encoding: String,
    /// Directory into which the binary files are written.
    output_dir: PathBuf,
}
23
24impl CharCategory {
25    /// コンストラクタ
26    /// # Arguments
27    /// * `input_dir`  - テキスト単語辞書が配置されているディレクトリのパス
28    /// * `encoding`   - テキスト単語辞書の文字列エンコーディング
29    /// * `output_dir` - バイナリ単語辞書の保存先ディレクトリ
30    pub fn new(input_dir: &Path, encoding: &str, output_dir: &Path) -> CharCategory {
31        CharCategory {
32            input_dir: input_dir.to_owned(),
33            encoding: encoding.to_owned(),
34            output_dir: output_dir.to_owned()
35        }
36    }
37
38    /// 文字カテゴリ定義のバイナリデータを作成する
39    pub fn build(self) -> AppResult<()> {
40        // 文字カテゴリの定義を取得する
41        let ccmap = self.parse_char_category_def()?;
42
43        {
44            // 文字カテゴリの定義を保存する
45            let mut categories: Vec<&Category> = Vec::new();
46            for e in ccmap.values() {
47                categories.push(e);
48            }
49            self.save_char_category_map(categories)?;
50        }
51
52        // 文字とカテゴリのマッピングを取得/保存する
53        self.build_code_category_map(ccmap)?;
54
55        Ok(())
56    }
57
58    fn parse_char_category_def(&self) -> AppResult<HashMap<String, Category>> {
59        let path = self.input_dir.join("char.def");
60        let too_few_fields = |rl: &ReadLine| -> AppError {
61            rl.parse_error("Invalid char category definition (too few fields).")
62        };
63        let parse_0or1 = |str: Option<&str>, rl: &ReadLine| -> AppResult<bool> {
64            str.ok_or_else(|| too_few_fields(rl))
65                .and_then(|s|
66                    if s == "1" {
67                        Ok(true)
68                    } else if s == "0" {
69                        Ok(false)
70                    } else {
71                        Err(rl.parse_error("Invalid char category definition (INVOKE must be '0' or '1')."))
72                    }
73                )
74        };
75        let mut rl = ReadLine::new(path.as_path(), &self.encoding)?;
76        let srch = Searcher::from_path(self.output_dir.join("word2id").as_path())?;
77        let mut map = HashMap::new();
78
79        let mut s = String::new();
80        loop {
81            let len = rl.next(&mut s).map_err(|e| rl.convert_error(e))?;
82            if len < 1 {
83                break;
84            }
85            let line = s.trim_end();
86            if line.is_empty() || line.starts_with('#') || line.starts_with('0') {
87                continue;
88            }
89
90            let mut ss = line.split_whitespace();
91            let name = ss.next().ok_or_else(|| too_few_fields(&rl))?;
92            let invoke: bool = parse_0or1(ss.next(), &rl)?;      // 0 or 1
93            let group: bool = parse_0or1(ss.next(), &rl)?;      // 0 or 1
94            // positive integer
95            let length: i32 = ss.next()
96                .ok_or_else(|| too_few_fields(&rl))
97                .and_then(|s| s.parse().map_err(AppError::from))?;
98            let key_utf16 = (KEY_PREFIX.to_string() + name).encode_utf16().collect::<Vec<_>>();
99            let id = srch.search(&key_utf16);
100
101            if length < 0 {
102                return Err(rl.parse_error("Invalid char category definition (LENGTH must be 0 or positive integer)."));
103            }
104            if id < 0 {
105                return Err(rl.parse_error(format!("Category '{}' is unregistered in trie", name)));
106            }
107            map.insert(name.to_string(), Category {
108                id,
109                length,
110                invoke,
111                group
112            });
113        }
114
115        // "DEFAULT"と"SPACE"は必須カテゴリ
116        if !map.contains_key("DEFAULT") {
117            return Err(rl.parse_error("Missing mandatory category 'DEFAULT'."));
118        }
119        if !map.contains_key("SPACE") {
120            return Err(rl.parse_error("Missing mandatory category 'SPACE'."));
121        }
122        Ok(map)
123    }
124
125    fn save_char_category_map(&self, mut categories: Vec<&Category>) -> AppResult<()> {
126        let mut writer = BufWriter::new(File::create(self.output_dir.join("char.category").as_path())?);
127        categories.sort();
128        for e in categories {
129            writer.write_i32::<NE>(e.id)?;
130            writer.write_i32::<NE>(e.length)?;
131            writer.write_i32::<NE>(if e.invoke { 1 } else { 0 })?;
132            writer.write_i32::<NE>(if e.group { 1 } else { 0 })?;
133        }
134        Ok(writer.flush()?)
135    }
136
137    fn build_code_category_map(&self, map: HashMap<String, Category>) -> AppResult<()> {
138        let mut chars: Vec<Rc<CharId>> = Vec::with_capacity(0x10_000);
139        {
140            let dft = Rc::new(CharId::new(map["DEFAULT"].id));
141            for _ in 0..0x10_000 {
142                chars.push(dft.clone());
143            }
144        }
145
146        {
147            let path = self.input_dir.join("char.def");
148            let mut rl = ReadLine::new(path.as_path(), &self.encoding)?;
149            let mut s = String::new();
150            loop {
151                let len = rl.next(&mut s).map_err(|e| rl.parse_error(e.description()))?;
152                if len < 1 {
153                    break;
154                }
155                let line = s.trim_end();
156                if line.is_empty() || !line.starts_with('0') {
157                    continue;
158                }
159
160                let mut ss = line.split_whitespace();
161                let beg: i32;
162                let end: i32;
163                let ss0 = ss.next().ok_or_else(|| rl.parse_error("Too few fields"))?;
164                if let Some(idx) = ss0.find("..") {
165                    beg = i32::from_str_radix(&ss0[2..idx], 16)
166                        .map_err(|e| rl.convert_error(e))?;
167                    end = i32::from_str_radix(&ss0[(idx + 2 + 2)..], 16)
168                        .map_err(|e| rl.convert_error(e))?;
169                } else {
170                    beg = i32::from_str_radix(&ss0[2..], 16)
171                        .map_err(|e| rl.convert_error(e))?;
172                    end = beg;
173                }
174
175                if !(0 <= beg && beg <= 0xFFFF &&
176                    0 <= end && end <= 0xFFFF && beg <= end) {
177                    return Err(rl.parse_error("Wrong UCS2 code specified."));
178                }
179
180                // 文字カテゴリ及び互換カテゴリの取得
181                let category_name = ss.next().ok_or_else(|| rl.parse_error("Too few fields"))?;
182                let category = map.get(category_name).ok_or_else(|| rl.parse_error(format!("Category '{}' is undefined.", category_name)))?;
183                let ch = {
184                    let mut ch = CharId::new(category.id);
185                    while let Some(f) = ss.next() {
186                        if f.starts_with('#') { break; }
187                        let category = map.get(f).ok_or_else(|| rl.parse_error(format!("Category '{}' is undefined.", f)))?;
188                        ch.add(category.id);
189                    }
190                    Rc::new(ch)
191                };
192
193                // カテゴリ登録
194                for i in beg..=end {
195                    chars[i as usize] = ch.clone();
196                }
197            }
198
199            if chars[SPACE_CHAR as usize].id != map["SPACE"].id {
200                return Err(rl.parse_error("0x0020 is reserved for 'SPACE' category"));
201            }
202        }
203
204        let mut writer = BufWriter::new(File::create(self.output_dir.join("code2category").as_path())?);
205        for c in &chars {
206            writer.write_i32::<NE>(c.id)?;
207        }
208        for c in &chars {
209            writer.write_i32::<NE>(c.mask)?;
210        }
211
212        Ok(writer.flush()?)
213    }
214}
215
216
217impl cmp::Ord for Category {
218    fn cmp(&self, other: &Self) -> cmp::Ordering {
219        self.id.cmp(&other.id)
220    }
221}
222
223impl cmp::PartialOrd for Category {
224    fn partial_cmp(&self, other: &Self) -> Option<cmp::Ordering> {
225        Some(self.cmp(other))
226    }
227}
228
229impl cmp::PartialEq for Category {
230    fn eq(&self, other: &Self) -> bool {
231        self.id.eq(&other.id)
232    }
233}
234
// Marker impl: `PartialEq` for `Category` compares integer ids, so equality is
// reflexive and `Eq` holds. `Ord` requires it.
impl cmp::Eq for Category {}
236
237
/// Category information attached to one character code.
struct CharId {
    /// Id of the character's primary category.
    id: i32,
    /// Bit set of category ids: bit `n` is set when category `n` was added.
    mask: i32,
}
242
243impl CharId {
244    pub fn new(id: i32) -> CharId {
245        let mut c = CharId { id, mask: 0 };
246        c.add(id);
247        c
248    }
249
250    pub fn add(&mut self, i: i32) {
251        self.mask |= 1 << i;
252    }
253}