jpreprocess_dictionary_builder/
ipadic_builder.rs

1use std::{
2    fs::{self, File},
3    io::{self, Write},
4    path::{Path, PathBuf},
5    str::FromStr,
6};
7
8use jpreprocess_dictionary::{serializer::lindera::LinderaSerializer, DictionarySerializer};
9use rayon::prelude::*;
10
11use byteorder::{LittleEndian, WriteBytesExt};
12use csv::StringRecord;
13use glob::glob;
14use log::debug;
15
16use lindera_core::{
17    character_definition::{CharacterDefinitions, CharacterDefinitionsBuilder},
18    dictionary::UserDictionary,
19    dictionary_builder::DictionaryBuilder,
20    error::LinderaErrorKind,
21    file_util::read_utf8_file,
22    unknown_dictionary::parse_unk,
23    LinderaResult,
24};
25
26use crate::build_dict::*;
27
28pub struct IpadicBuilder {
29    serializer: Box<dyn DictionarySerializer + Send + Sync>,
30}
31
32impl IpadicBuilder {
33    const UNK_FIELDS_NUM: usize = 11;
34
35    pub fn new(serializer: Box<dyn Send + Sync + DictionarySerializer>) -> Self {
36        IpadicBuilder { serializer }
37    }
38
39    fn write_words(
40        &self,
41        wtr_words_path: &Path,
42        wtr_words_idx_path: &Path,
43        is_system: bool,
44        normalized_rows: &Vec<Vec<String>>,
45    ) -> Result<(), lindera_core::error::LinderaError> {
46        let mut wtr_words = io::BufWriter::new(
47            File::create(wtr_words_path)
48                .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?,
49        );
50        let mut wtr_words_idx = io::BufWriter::new(
51            File::create(wtr_words_idx_path)
52                .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?,
53        );
54
55        let (words_idx_buffer, words_buffer) =
56            build_words(&self.serializer, normalized_rows, is_system)?;
57
58        write(&words_buffer, &mut wtr_words)?;
59        write(&words_idx_buffer, &mut wtr_words_idx)?;
60        wtr_words
61            .flush()
62            .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
63        wtr_words_idx
64            .flush()
65            .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
66        Ok(())
67    }
68
69    pub fn build_user_dict_from_data(
70        &self,
71        rows: &Vec<Vec<&str>>,
72    ) -> LinderaResult<UserDictionary> {
73        let mut normalized_rows: Vec<Vec<String>> = normalize_rows(rows);
74        normalized_rows.par_sort_by_key(|row| row.first().map(|s| s.to_string()));
75        let (words_idx_data, words_data) = build_words(&self.serializer, &normalized_rows, false)?;
76        let dict = build_prefix_dict(build_word_entry_map(&normalized_rows, false)?, false)?;
77        Ok(UserDictionary {
78            dict,
79            words_idx_data,
80            words_data,
81        })
82    }
83}
84
85impl Default for IpadicBuilder {
86    fn default() -> Self {
87        Self::new(Box::new(LinderaSerializer))
88    }
89}
90
91impl DictionaryBuilder for IpadicBuilder {
92    fn build_dictionary(&self, input_dir: &Path, output_dir: &Path) -> LinderaResult<()> {
93        fs::create_dir_all(output_dir)
94            .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
95
96        let chardef = self.build_chardef(input_dir, output_dir)?;
97        self.build_unk(input_dir, &chardef, output_dir)?;
98        self.build_dict(input_dir, output_dir)?;
99        self.build_cost_matrix(input_dir, output_dir)?;
100
101        Ok(())
102    }
103
104    fn build_user_dictionary(&self, input_file: &Path, output_file: &Path) -> LinderaResult<()> {
105        let parent_dir = match output_file.parent() {
106            Some(parent_dir) => parent_dir,
107            None => {
108                return Err(LinderaErrorKind::Io.with_error(anyhow::anyhow!(
109                    "failed to get parent directory of output file"
110                )))
111            }
112        };
113        fs::create_dir_all(parent_dir)
114            .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
115
116        let user_dict = self.build_user_dict(input_file)?;
117
118        let mut wtr = io::BufWriter::new(
119            File::create(output_file)
120                .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?,
121        );
122        bincode::serialize_into(&mut wtr, &user_dict)
123            .map_err(|err| LinderaErrorKind::Serialize.with_error(anyhow::anyhow!(err)))?;
124        wtr.flush()
125            .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
126
127        Ok(())
128    }
129
130    fn build_chardef(
131        &self,
132        input_dir: &Path,
133        output_dir: &Path,
134    ) -> LinderaResult<CharacterDefinitions> {
135        let char_def_path = input_dir.join("char.def");
136        debug!("reading {:?}", char_def_path);
137
138        let char_def = read_utf8_file(&char_def_path)?;
139        let mut char_definitions_builder = CharacterDefinitionsBuilder::default();
140        char_definitions_builder.parse(&char_def)?;
141        let char_definitions = char_definitions_builder.build();
142
143        let mut chardef_buffer = Vec::new();
144        bincode::serialize_into(&mut chardef_buffer, &char_definitions)
145            .map_err(|err| LinderaErrorKind::Serialize.with_error(anyhow::anyhow!(err)))?;
146
147        let wtr_chardef_path = output_dir.join(Path::new("char_def.bin"));
148        let mut wtr_chardef = io::BufWriter::new(
149            File::create(wtr_chardef_path)
150                .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?,
151        );
152
153        write(&chardef_buffer, &mut wtr_chardef)?;
154
155        wtr_chardef
156            .flush()
157            .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
158
159        Ok(char_definitions)
160    }
161
162    fn build_unk(
163        &self,
164        input_dir: &Path,
165        chardef: &CharacterDefinitions,
166        output_dir: &Path,
167    ) -> LinderaResult<()> {
168        let unk_data_path = input_dir.join("unk.def");
169        debug!("reading {:?}", unk_data_path);
170
171        let unk_data = read_utf8_file(&unk_data_path)?;
172        let unknown_dictionary = parse_unk(chardef.categories(), &unk_data, Self::UNK_FIELDS_NUM)?;
173
174        let mut unk_buffer = Vec::new();
175        bincode::serialize_into(&mut unk_buffer, &unknown_dictionary)
176            .map_err(|err| LinderaErrorKind::Serialize.with_error(anyhow::anyhow!(err)))?;
177
178        let wtr_unk_path = output_dir.join(Path::new("unk.bin"));
179        let mut wtr_unk = io::BufWriter::new(
180            File::create(wtr_unk_path)
181                .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?,
182        );
183        write(&unk_buffer, &mut wtr_unk)?;
184        wtr_unk
185            .flush()
186            .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
187
188        Ok(())
189    }
190
191    fn build_dict(&self, input_dir: &Path, output_dir: &Path) -> LinderaResult<()> {
192        let pattern = if let Some(path) = input_dir.to_str() {
193            format!("{}/*.csv", path)
194        } else {
195            return Err(
196                LinderaErrorKind::Io.with_error(anyhow::anyhow!("Failed to convert path to &str."))
197            );
198        };
199
200        let mut filenames: Vec<PathBuf> = Vec::new();
201        for entry in
202            glob(&pattern).map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?
203        {
204            match entry {
205                Ok(path) => {
206                    if let Some(filename) = path.file_name() {
207                        filenames.push(Path::new(input_dir).join(filename));
208                    } else {
209                        return Err(LinderaErrorKind::Io
210                            .with_error(anyhow::anyhow!("failed to get filename")));
211                    }
212                }
213                Err(err) => return Err(LinderaErrorKind::Content.with_error(anyhow::anyhow!(err))),
214            }
215        }
216
217        let mut rows: Vec<StringRecord> = vec![];
218        for filename in filenames {
219            debug!("reading {:?}", filename);
220
221            let mut rdr = csv::ReaderBuilder::new()
222                .has_headers(false)
223                .from_path(filename)
224                .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
225
226            for result in rdr.records() {
227                let record = result
228                    .map_err(|err| LinderaErrorKind::Content.with_error(anyhow::anyhow!(err)))?;
229                rows.push(record);
230            }
231        }
232
233        let mut normalized_rows: Vec<Vec<String>> = normalize_rows(&rows);
234
235        normalized_rows.par_sort_by_key(|row| row.first().map(|s| s.to_string()));
236
237        let wtr_da_path = output_dir.join(Path::new("dict.da"));
238        let mut wtr_da = io::BufWriter::new(
239            File::create(wtr_da_path)
240                .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?,
241        );
242
243        let wtr_vals_path = output_dir.join(Path::new("dict.vals"));
244        let mut wtr_vals = io::BufWriter::new(
245            File::create(wtr_vals_path)
246                .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?,
247        );
248
249        self.write_words(
250            output_dir.join(Path::new("dict.words")).as_path(),
251            output_dir.join(Path::new("dict.wordsidx")).as_path(),
252            true,
253            &normalized_rows,
254        )?;
255
256        let prefix_dict = build_prefix_dict(build_word_entry_map(&normalized_rows, true)?, true)?;
257
258        write(&prefix_dict.da.0, &mut wtr_da)?;
259
260        write(&prefix_dict.vals_data, &mut wtr_vals)?;
261
262        wtr_vals
263            .flush()
264            .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
265
266        Ok(())
267    }
268
269    fn build_cost_matrix(&self, input_dir: &Path, output_dir: &Path) -> LinderaResult<()> {
270        let matrix_data_path = input_dir.join("matrix.def");
271        debug!("reading {:?}", matrix_data_path);
272
273        let matrix_data = read_utf8_file(&matrix_data_path)?;
274        let mut lines_it = matrix_data
275            .par_lines()
276            .map(|line| {
277                line.split_whitespace()
278                    .map(i32::from_str)
279                    .collect::<Result<Vec<i32>, _>>()
280                    .map_err(|err| LinderaErrorKind::Parse.with_error(anyhow::anyhow!(err)))
281            })
282            .collect::<Result<Vec<_>, _>>()?
283            .into_iter();
284        let header = lines_it.next().ok_or_else(|| {
285            LinderaErrorKind::Content.with_error(anyhow::anyhow!("unknown error"))
286        })?;
287        let forward_size = header[0] as u32;
288        let backward_size = header[1] as u32;
289        let len = 2 + (forward_size * backward_size) as usize;
290        let mut costs = vec![i16::MAX; len];
291        costs[0] = forward_size as i16;
292        costs[1] = backward_size as i16;
293        for fields in lines_it {
294            let forward_id = fields[0] as u32;
295            let backward_id = fields[1] as u32;
296            let cost = fields[2] as u16;
297            costs[2 + (backward_id + forward_id * backward_size) as usize] = cost as i16;
298        }
299
300        let wtr_matrix_mtx_path = output_dir.join(Path::new("matrix.mtx"));
301        let mut wtr_matrix_mtx = io::BufWriter::new(
302            File::create(wtr_matrix_mtx_path)
303                .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?,
304        );
305        let mut matrix_mtx_buffer = Vec::new();
306        for cost in costs {
307            matrix_mtx_buffer
308                .write_i16::<LittleEndian>(cost)
309                .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
310        }
311        write(&matrix_mtx_buffer, &mut wtr_matrix_mtx)?;
312
313        wtr_matrix_mtx
314            .flush()
315            .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
316
317        Ok(())
318    }
319
320    fn build_user_dict(&self, input_file: &Path) -> LinderaResult<UserDictionary> {
321        debug!("reading {:?}", input_file);
322
323        let mut rdr = csv::ReaderBuilder::new()
324            .has_headers(false)
325            .flexible(true)
326            .from_path(input_file)
327            .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
328
329        let mut rows: Vec<StringRecord> = vec![];
330        for result in rdr.records() {
331            let record =
332                result.map_err(|err| LinderaErrorKind::Content.with_error(anyhow::anyhow!(err)))?;
333            rows.push(record);
334        }
335
336        let mut normalized_rows: Vec<Vec<String>> = normalize_rows(&rows);
337        normalized_rows.par_sort_by_key(|row| row.first().map(|s| s.to_string()));
338        let (words_idx_data, words_data) = build_words(&self.serializer, &normalized_rows, false)?;
339        let dict = build_prefix_dict(build_word_entry_map(&normalized_rows, false)?, false)?;
340
341        Ok(UserDictionary {
342            dict,
343            words_idx_data,
344            words_data,
345        })
346    }
347}
348
349fn write<W: Write>(buffer: &[u8], writer: &mut W) -> LinderaResult<()> {
350    writer
351        .write_all(buffer)
352        .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
353
354    Ok(())
355}