lindera_ipadic_builder/
ipadic_builder.rs

1use std::{fs, path::Path};
2
3use lindera_core::{
4    character_definition::CharacterDefinitions, dictionary::UserDictionary,
5    dictionary_builder::DictionaryBuilder, error::LinderaErrorKind, LinderaResult,
6};
7use lindera_decompress::Algorithm;
8use lindera_dictionary_builder::{
9    build_user_dictionary, CharDefBuilderOptions, CostMatrixBuilderOptions, DictBuilderOptions,
10    UnkBuilderOptions, UserDictBuilderOptions,
11};
12
// Settings forwarded to `UserDictBuilderOptions` when a user dictionary is built.

/// Value given to `simple_userdic_fields_num`; the simple-row handler reads
/// exactly the first three fields of a row.
const SIMPLE_USERDIC_FIELDS_NUM: usize = 3;
/// Value given to `detailed_userdic_fields_num`.
const DETAILED_USERDIC_FIELDS_NUM: usize = 13;
/// Value given to `simple_word_cost`.
// NOTE(review): presumably a strongly negative cost so that simple user
// entries are preferred over system entries -- confirm against the builder.
const SIMPLE_WORD_COST: i16 = -10_000;
/// Value given to `simple_context_id`.
const SIMPLE_CONTEXT_ID: u16 = 0;
/// Compression algorithm handed to the character-definition, unknown-word,
/// dictionary and cost-matrix builders in this file.
const COMPRESS_ALGORITHM: Algorithm = Algorithm::Deflate;
/// Value given to `unk_fields_num` on the unknown-word builder.
// NOTE(review): presumably the number of columns expected per unknown-word
// definition row -- confirm against `UnkBuilderOptions`.
const UNK_FIELDS_NUM: usize = 11;
/// Encoding name handed to the character-definition, unknown-word, dictionary
/// and cost-matrix builders.
// NOTE(review): presumably the encoding of the input source files -- confirm.
// The `'static` lifetime is implied for constants, so it is not spelled out.
const ENCODING: &str = "EUC-JP";
20
/// Dictionary builder for the IPADIC format.
///
/// The type carries no state; all configuration comes from the constants in
/// this file. `Debug` and `Clone` are derived so the public type can be
/// logged and duplicated like other plain value types.
#[derive(Debug, Clone)]
pub struct IpadicBuilder {}

impl IpadicBuilder {
    /// Creates a new builder.
    pub fn new() -> Self {
        Self {}
    }
}

impl Default for IpadicBuilder {
    /// Equivalent to [`IpadicBuilder::new`].
    fn default() -> Self {
        Self::new()
    }
}
34
35impl DictionaryBuilder for IpadicBuilder {
36    fn build_dictionary(&self, input_dir: &Path, output_dir: &Path) -> LinderaResult<()> {
37        fs::create_dir_all(output_dir)
38            .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
39
40        let chardef = self.build_chardef(input_dir, output_dir)?;
41        self.build_unk(input_dir, &chardef, output_dir)?;
42        self.build_dict(input_dir, output_dir)?;
43        self.build_cost_matrix(input_dir, output_dir)?;
44
45        Ok(())
46    }
47
48    fn build_user_dictionary(&self, input_file: &Path, output_file: &Path) -> LinderaResult<()> {
49        let user_dict = self.build_user_dict(input_file)?;
50        build_user_dictionary(user_dict, output_file)
51    }
52
53    fn build_chardef(
54        &self,
55        input_dir: &Path,
56        output_dir: &Path,
57    ) -> LinderaResult<CharacterDefinitions> {
58        CharDefBuilderOptions::default()
59            .encoding(ENCODING)
60            .compress_algorithm(COMPRESS_ALGORITHM)
61            .builder()
62            .unwrap()
63            .build(input_dir, output_dir)
64    }
65
66    fn build_unk(
67        &self,
68        input_dir: &Path,
69        chardef: &CharacterDefinitions,
70        output_dir: &Path,
71    ) -> LinderaResult<()> {
72        UnkBuilderOptions::default()
73            .encoding(ENCODING)
74            .compress_algorithm(COMPRESS_ALGORITHM)
75            .unk_fields_num(UNK_FIELDS_NUM)
76            .builder()
77            .unwrap()
78            .build(input_dir, chardef, output_dir)
79    }
80
81    fn build_dict(&self, input_dir: &Path, output_dir: &Path) -> LinderaResult<()> {
82        DictBuilderOptions::default()
83            .flexible_csv(false)
84            .encoding(ENCODING)
85            .compress_algorithm(COMPRESS_ALGORITHM)
86            .normalize_details(true)
87            .builder()
88            .unwrap()
89            .build(input_dir, output_dir)
90    }
91
92    fn build_cost_matrix(&self, input_dir: &Path, output_dir: &Path) -> LinderaResult<()> {
93        CostMatrixBuilderOptions::default()
94            .encoding(ENCODING)
95            .compress_algorithm(COMPRESS_ALGORITHM)
96            .builder()
97            .unwrap()
98            .build(&input_dir, output_dir)
99    }
100
101    fn build_user_dict(&self, input_file: &Path) -> LinderaResult<UserDictionary> {
102        UserDictBuilderOptions::default()
103            .simple_userdic_fields_num(SIMPLE_USERDIC_FIELDS_NUM)
104            .detailed_userdic_fields_num(DETAILED_USERDIC_FIELDS_NUM)
105            .simple_word_cost(SIMPLE_WORD_COST)
106            .simple_context_id(SIMPLE_CONTEXT_ID)
107            .flexible_csv(true)
108            .simple_userdic_details_handler(Box::new(|row| {
109                Ok(vec![
110                    row[1].to_string(), // POS
111                    "*".to_string(),    // POS subcategory 1
112                    "*".to_string(),    // POS subcategory 2
113                    "*".to_string(),    // POS subcategory 3
114                    "*".to_string(),    // Conjugation type
115                    "*".to_string(),    // Conjugation form
116                    row[0].to_string(), // Base form
117                    row[2].to_string(), // Reading
118                    "*".to_string(),    // Pronunciation
119                ])
120            }))
121            .builder()
122            .unwrap()
123            .build(input_file)
124    }
125}