jpreprocess_dictionary_builder/
ipadic_builder.rs1use std::{
2 fs::{self, File},
3 io::{self, Write},
4 path::{Path, PathBuf},
5 str::FromStr,
6};
7
8use jpreprocess_dictionary::{serializer::lindera::LinderaSerializer, DictionarySerializer};
9use rayon::prelude::*;
10
11use byteorder::{LittleEndian, WriteBytesExt};
12use csv::StringRecord;
13use glob::glob;
14use log::debug;
15
16use lindera_core::{
17 character_definition::{CharacterDefinitions, CharacterDefinitionsBuilder},
18 dictionary::UserDictionary,
19 dictionary_builder::DictionaryBuilder,
20 error::LinderaErrorKind,
21 file_util::read_utf8_file,
22 unknown_dictionary::parse_unk,
23 LinderaResult,
24};
25
26use crate::build_dict::*;
27
28pub struct IpadicBuilder {
29 serializer: Box<dyn DictionarySerializer + Send + Sync>,
30}
31
32impl IpadicBuilder {
33 const UNK_FIELDS_NUM: usize = 11;
34
35 pub fn new(serializer: Box<dyn Send + Sync + DictionarySerializer>) -> Self {
36 IpadicBuilder { serializer }
37 }
38
39 fn write_words(
40 &self,
41 wtr_words_path: &Path,
42 wtr_words_idx_path: &Path,
43 is_system: bool,
44 normalized_rows: &Vec<Vec<String>>,
45 ) -> Result<(), lindera_core::error::LinderaError> {
46 let mut wtr_words = io::BufWriter::new(
47 File::create(wtr_words_path)
48 .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?,
49 );
50 let mut wtr_words_idx = io::BufWriter::new(
51 File::create(wtr_words_idx_path)
52 .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?,
53 );
54
55 let (words_idx_buffer, words_buffer) =
56 build_words(&self.serializer, normalized_rows, is_system)?;
57
58 write(&words_buffer, &mut wtr_words)?;
59 write(&words_idx_buffer, &mut wtr_words_idx)?;
60 wtr_words
61 .flush()
62 .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
63 wtr_words_idx
64 .flush()
65 .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
66 Ok(())
67 }
68
69 pub fn build_user_dict_from_data(
70 &self,
71 rows: &Vec<Vec<&str>>,
72 ) -> LinderaResult<UserDictionary> {
73 let mut normalized_rows: Vec<Vec<String>> = normalize_rows(rows);
74 normalized_rows.par_sort_by_key(|row| row.first().map(|s| s.to_string()));
75 let (words_idx_data, words_data) = build_words(&self.serializer, &normalized_rows, false)?;
76 let dict = build_prefix_dict(build_word_entry_map(&normalized_rows, false)?, false)?;
77 Ok(UserDictionary {
78 dict,
79 words_idx_data,
80 words_data,
81 })
82 }
83}
84
85impl Default for IpadicBuilder {
86 fn default() -> Self {
87 Self::new(Box::new(LinderaSerializer))
88 }
89}
90
91impl DictionaryBuilder for IpadicBuilder {
92 fn build_dictionary(&self, input_dir: &Path, output_dir: &Path) -> LinderaResult<()> {
93 fs::create_dir_all(output_dir)
94 .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
95
96 let chardef = self.build_chardef(input_dir, output_dir)?;
97 self.build_unk(input_dir, &chardef, output_dir)?;
98 self.build_dict(input_dir, output_dir)?;
99 self.build_cost_matrix(input_dir, output_dir)?;
100
101 Ok(())
102 }
103
104 fn build_user_dictionary(&self, input_file: &Path, output_file: &Path) -> LinderaResult<()> {
105 let parent_dir = match output_file.parent() {
106 Some(parent_dir) => parent_dir,
107 None => {
108 return Err(LinderaErrorKind::Io.with_error(anyhow::anyhow!(
109 "failed to get parent directory of output file"
110 )))
111 }
112 };
113 fs::create_dir_all(parent_dir)
114 .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
115
116 let user_dict = self.build_user_dict(input_file)?;
117
118 let mut wtr = io::BufWriter::new(
119 File::create(output_file)
120 .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?,
121 );
122 bincode::serialize_into(&mut wtr, &user_dict)
123 .map_err(|err| LinderaErrorKind::Serialize.with_error(anyhow::anyhow!(err)))?;
124 wtr.flush()
125 .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
126
127 Ok(())
128 }
129
130 fn build_chardef(
131 &self,
132 input_dir: &Path,
133 output_dir: &Path,
134 ) -> LinderaResult<CharacterDefinitions> {
135 let char_def_path = input_dir.join("char.def");
136 debug!("reading {:?}", char_def_path);
137
138 let char_def = read_utf8_file(&char_def_path)?;
139 let mut char_definitions_builder = CharacterDefinitionsBuilder::default();
140 char_definitions_builder.parse(&char_def)?;
141 let char_definitions = char_definitions_builder.build();
142
143 let mut chardef_buffer = Vec::new();
144 bincode::serialize_into(&mut chardef_buffer, &char_definitions)
145 .map_err(|err| LinderaErrorKind::Serialize.with_error(anyhow::anyhow!(err)))?;
146
147 let wtr_chardef_path = output_dir.join(Path::new("char_def.bin"));
148 let mut wtr_chardef = io::BufWriter::new(
149 File::create(wtr_chardef_path)
150 .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?,
151 );
152
153 write(&chardef_buffer, &mut wtr_chardef)?;
154
155 wtr_chardef
156 .flush()
157 .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
158
159 Ok(char_definitions)
160 }
161
162 fn build_unk(
163 &self,
164 input_dir: &Path,
165 chardef: &CharacterDefinitions,
166 output_dir: &Path,
167 ) -> LinderaResult<()> {
168 let unk_data_path = input_dir.join("unk.def");
169 debug!("reading {:?}", unk_data_path);
170
171 let unk_data = read_utf8_file(&unk_data_path)?;
172 let unknown_dictionary = parse_unk(chardef.categories(), &unk_data, Self::UNK_FIELDS_NUM)?;
173
174 let mut unk_buffer = Vec::new();
175 bincode::serialize_into(&mut unk_buffer, &unknown_dictionary)
176 .map_err(|err| LinderaErrorKind::Serialize.with_error(anyhow::anyhow!(err)))?;
177
178 let wtr_unk_path = output_dir.join(Path::new("unk.bin"));
179 let mut wtr_unk = io::BufWriter::new(
180 File::create(wtr_unk_path)
181 .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?,
182 );
183 write(&unk_buffer, &mut wtr_unk)?;
184 wtr_unk
185 .flush()
186 .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
187
188 Ok(())
189 }
190
191 fn build_dict(&self, input_dir: &Path, output_dir: &Path) -> LinderaResult<()> {
192 let pattern = if let Some(path) = input_dir.to_str() {
193 format!("{}/*.csv", path)
194 } else {
195 return Err(
196 LinderaErrorKind::Io.with_error(anyhow::anyhow!("Failed to convert path to &str."))
197 );
198 };
199
200 let mut filenames: Vec<PathBuf> = Vec::new();
201 for entry in
202 glob(&pattern).map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?
203 {
204 match entry {
205 Ok(path) => {
206 if let Some(filename) = path.file_name() {
207 filenames.push(Path::new(input_dir).join(filename));
208 } else {
209 return Err(LinderaErrorKind::Io
210 .with_error(anyhow::anyhow!("failed to get filename")));
211 }
212 }
213 Err(err) => return Err(LinderaErrorKind::Content.with_error(anyhow::anyhow!(err))),
214 }
215 }
216
217 let mut rows: Vec<StringRecord> = vec![];
218 for filename in filenames {
219 debug!("reading {:?}", filename);
220
221 let mut rdr = csv::ReaderBuilder::new()
222 .has_headers(false)
223 .from_path(filename)
224 .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
225
226 for result in rdr.records() {
227 let record = result
228 .map_err(|err| LinderaErrorKind::Content.with_error(anyhow::anyhow!(err)))?;
229 rows.push(record);
230 }
231 }
232
233 let mut normalized_rows: Vec<Vec<String>> = normalize_rows(&rows);
234
235 normalized_rows.par_sort_by_key(|row| row.first().map(|s| s.to_string()));
236
237 let wtr_da_path = output_dir.join(Path::new("dict.da"));
238 let mut wtr_da = io::BufWriter::new(
239 File::create(wtr_da_path)
240 .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?,
241 );
242
243 let wtr_vals_path = output_dir.join(Path::new("dict.vals"));
244 let mut wtr_vals = io::BufWriter::new(
245 File::create(wtr_vals_path)
246 .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?,
247 );
248
249 self.write_words(
250 output_dir.join(Path::new("dict.words")).as_path(),
251 output_dir.join(Path::new("dict.wordsidx")).as_path(),
252 true,
253 &normalized_rows,
254 )?;
255
256 let prefix_dict = build_prefix_dict(build_word_entry_map(&normalized_rows, true)?, true)?;
257
258 write(&prefix_dict.da.0, &mut wtr_da)?;
259
260 write(&prefix_dict.vals_data, &mut wtr_vals)?;
261
262 wtr_vals
263 .flush()
264 .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
265
266 Ok(())
267 }
268
269 fn build_cost_matrix(&self, input_dir: &Path, output_dir: &Path) -> LinderaResult<()> {
270 let matrix_data_path = input_dir.join("matrix.def");
271 debug!("reading {:?}", matrix_data_path);
272
273 let matrix_data = read_utf8_file(&matrix_data_path)?;
274 let mut lines_it = matrix_data
275 .par_lines()
276 .map(|line| {
277 line.split_whitespace()
278 .map(i32::from_str)
279 .collect::<Result<Vec<i32>, _>>()
280 .map_err(|err| LinderaErrorKind::Parse.with_error(anyhow::anyhow!(err)))
281 })
282 .collect::<Result<Vec<_>, _>>()?
283 .into_iter();
284 let header = lines_it.next().ok_or_else(|| {
285 LinderaErrorKind::Content.with_error(anyhow::anyhow!("unknown error"))
286 })?;
287 let forward_size = header[0] as u32;
288 let backward_size = header[1] as u32;
289 let len = 2 + (forward_size * backward_size) as usize;
290 let mut costs = vec![i16::MAX; len];
291 costs[0] = forward_size as i16;
292 costs[1] = backward_size as i16;
293 for fields in lines_it {
294 let forward_id = fields[0] as u32;
295 let backward_id = fields[1] as u32;
296 let cost = fields[2] as u16;
297 costs[2 + (backward_id + forward_id * backward_size) as usize] = cost as i16;
298 }
299
300 let wtr_matrix_mtx_path = output_dir.join(Path::new("matrix.mtx"));
301 let mut wtr_matrix_mtx = io::BufWriter::new(
302 File::create(wtr_matrix_mtx_path)
303 .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?,
304 );
305 let mut matrix_mtx_buffer = Vec::new();
306 for cost in costs {
307 matrix_mtx_buffer
308 .write_i16::<LittleEndian>(cost)
309 .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
310 }
311 write(&matrix_mtx_buffer, &mut wtr_matrix_mtx)?;
312
313 wtr_matrix_mtx
314 .flush()
315 .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
316
317 Ok(())
318 }
319
320 fn build_user_dict(&self, input_file: &Path) -> LinderaResult<UserDictionary> {
321 debug!("reading {:?}", input_file);
322
323 let mut rdr = csv::ReaderBuilder::new()
324 .has_headers(false)
325 .flexible(true)
326 .from_path(input_file)
327 .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
328
329 let mut rows: Vec<StringRecord> = vec![];
330 for result in rdr.records() {
331 let record =
332 result.map_err(|err| LinderaErrorKind::Content.with_error(anyhow::anyhow!(err)))?;
333 rows.push(record);
334 }
335
336 let mut normalized_rows: Vec<Vec<String>> = normalize_rows(&rows);
337 normalized_rows.par_sort_by_key(|row| row.first().map(|s| s.to_string()));
338 let (words_idx_data, words_data) = build_words(&self.serializer, &normalized_rows, false)?;
339 let dict = build_prefix_dict(build_word_entry_map(&normalized_rows, false)?, false)?;
340
341 Ok(UserDictionary {
342 dict,
343 words_idx_data,
344 words_data,
345 })
346 }
347}
348
349fn write<W: Write>(buffer: &[u8], writer: &mut W) -> LinderaResult<()> {
350 writer
351 .write_all(buffer)
352 .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
353
354 Ok(())
355}