lindera_dictionary_builder/
cost_matrix.rs

1use std::borrow::Cow;
2use std::fs::File;
3use std::io;
4use std::io::Write;
5use std::path::Path;
6use std::str::FromStr;
7
8use byteorder::{LittleEndian, WriteBytesExt};
9use derive_builder::Builder;
10use lindera_core::error::LinderaErrorKind;
11use lindera_core::LinderaResult;
12use lindera_decompress::Algorithm;
13use log::debug;
14
15use crate::utils::{compress_write, read_file_with_encoding};
16
17#[derive(Builder, Debug)]
18#[builder(name = "CostMatrixBuilderOptions")]
19#[builder(build_fn(name = "builder"))]
20pub struct CostMatrixBuilder {
21    #[builder(default = "\"UTF-8\".into()", setter(into))]
22    encoding: Cow<'static, str>,
23    #[builder(default = "Algorithm::Deflate")]
24    compress_algorithm: Algorithm,
25}
26
27impl CostMatrixBuilder {
28    pub fn build(&self, input_dir: &Path, output_dir: &Path) -> LinderaResult<()> {
29        let matrix_data_path = input_dir.join("matrix.def");
30        debug!("reading {:?}", matrix_data_path);
31        let matrix_data = read_file_with_encoding(&matrix_data_path, &self.encoding)?;
32
33        let mut lines = Vec::new();
34        for line in matrix_data.lines() {
35            let fields: Vec<i32> = line
36                .split_whitespace()
37                .map(i32::from_str)
38                .collect::<Result<_, _>>()
39                .map_err(|err| LinderaErrorKind::Parse.with_error(anyhow::anyhow!(err)))?;
40            lines.push(fields);
41        }
42        let mut lines_it = lines.into_iter();
43        let header = lines_it.next().ok_or_else(|| {
44            LinderaErrorKind::Content.with_error(anyhow::anyhow!("unknown error"))
45        })?;
46        let forward_size = header[0] as u32;
47        let backward_size = header[1] as u32;
48        let len = 2 + (forward_size * backward_size) as usize;
49        let mut costs = vec![i16::MAX; len];
50        costs[0] = forward_size as i16;
51        costs[1] = backward_size as i16;
52        for fields in lines_it {
53            let forward_id = fields[0] as u32;
54            let backward_id = fields[1] as u32;
55            let cost = fields[2] as u16;
56            costs[2 + (backward_id + forward_id * backward_size) as usize] = cost as i16;
57        }
58
59        let wtr_matrix_mtx_path = output_dir.join(Path::new("matrix.mtx"));
60        let mut wtr_matrix_mtx = io::BufWriter::new(
61            File::create(wtr_matrix_mtx_path)
62                .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?,
63        );
64        let mut matrix_mtx_buffer = Vec::new();
65        for cost in costs {
66            matrix_mtx_buffer
67                .write_i16::<LittleEndian>(cost)
68                .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
69        }
70
71        compress_write(
72            &matrix_mtx_buffer,
73            self.compress_algorithm,
74            &mut wtr_matrix_mtx,
75        )?;
76
77        wtr_matrix_mtx
78            .flush()
79            .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
80
81        Ok(())
82    }
83}