lindera_dictionary/
builder.rs

1pub mod character_definition;
2pub mod connection_cost_matrix;
3pub mod metadata;
4pub mod prefix_dictionary;
5pub mod unknown_dictionary;
6pub mod user_dictionary;
7
8use std::fs;
9use std::path::Path;
10
11use csv::StringRecord;
12
13use self::character_definition::CharacterDefinitionBuilderOptions;
14use self::connection_cost_matrix::ConnectionCostMatrixBuilderOptions;
15use self::metadata::MetadataBuilder;
16use self::prefix_dictionary::PrefixDictionaryBuilderOptions;
17use self::unknown_dictionary::UnknownDictionaryBuilderOptions;
18use self::user_dictionary::{UserDictionaryBuilderOptions, build_user_dictionary};
19use crate::LinderaResult;
20use crate::dictionary::UserDictionary;
21use crate::dictionary::character_definition::CharacterDefinition;
22use crate::dictionary::metadata::Metadata;
23use crate::error::LinderaErrorKind;
24
25#[derive(Clone)]
26pub struct DictionaryBuilder {
27    metadata: Metadata,
28}
29
30impl DictionaryBuilder {
31    pub fn new(metadata: Metadata) -> Self {
32        Self { metadata }
33    }
34
35    pub fn build_dictionary(&self, input_dir: &Path, output_dir: &Path) -> LinderaResult<()> {
36        fs::create_dir_all(output_dir)
37            .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
38
39        self.build_metadata(output_dir)?;
40        let chardef = self.build_character_definition(input_dir, output_dir)?;
41        self.build_unknown_dictionary(input_dir, output_dir, &chardef)?;
42        self.build_prefix_dictionary(input_dir, output_dir)?;
43        self.build_connection_cost_matrix(input_dir, output_dir)?;
44
45        Ok(())
46    }
47
48    pub fn build_metadata(&self, output_dir: &Path) -> LinderaResult<()> {
49        MetadataBuilder::new().build(&self.metadata, output_dir)
50    }
51
52    pub fn build_character_definition(
53        &self,
54        input_dir: &Path,
55        output_dir: &Path,
56    ) -> LinderaResult<CharacterDefinition> {
57        CharacterDefinitionBuilderOptions::default()
58            .encoding(self.metadata.encoding.clone())
59            .compress_algorithm(self.metadata.compress_algorithm)
60            .builder()
61            .unwrap()
62            .build(input_dir, output_dir)
63    }
64
65    pub fn build_unknown_dictionary(
66        &self,
67        input_dir: &Path,
68        output_dir: &Path,
69        chardef: &CharacterDefinition,
70    ) -> LinderaResult<()> {
71        UnknownDictionaryBuilderOptions::default()
72            .encoding(self.metadata.encoding.clone())
73            .compress_algorithm(self.metadata.compress_algorithm)
74            .builder()
75            .unwrap()
76            .build(input_dir, chardef, output_dir)
77    }
78
79    pub fn build_prefix_dictionary(
80        &self,
81        input_dir: &Path,
82        output_dir: &Path,
83    ) -> LinderaResult<()> {
84        PrefixDictionaryBuilderOptions::default()
85            .flexible_csv(self.metadata.flexible_csv)
86            .encoding(self.metadata.encoding.clone())
87            .compress_algorithm(self.metadata.compress_algorithm)
88            .skip_invalid_cost_or_id(self.metadata.skip_invalid_cost_or_id)
89            .normalize_details(self.metadata.normalize_details)
90            .schema(self.metadata.dictionary_schema.clone())
91            .builder()
92            .unwrap()
93            .build(input_dir, output_dir)
94    }
95
96    pub fn build_connection_cost_matrix(
97        &self,
98        input_dir: &Path,
99        output_dir: &Path,
100    ) -> LinderaResult<()> {
101        ConnectionCostMatrixBuilderOptions::default()
102            .encoding(self.metadata.encoding.clone())
103            .compress_algorithm(self.metadata.compress_algorithm)
104            .builder()
105            .unwrap()
106            .build(input_dir, output_dir)
107    }
108
109    pub fn build_user_dictionary(
110        &self,
111        input_file: &Path,
112        output_file: &Path,
113    ) -> LinderaResult<()> {
114        let user_dict = self.build_user_dict(input_file)?;
115        build_user_dictionary(user_dict, output_file)
116    }
117
118    pub fn build_user_dict(&self, input_file: &Path) -> LinderaResult<UserDictionary> {
119        let userdic_schema = self.metadata.user_dictionary_schema.clone();
120        let dict_schema = self.metadata.dictionary_schema.clone();
121        let default_field_value = self.metadata.default_field_value.clone();
122
123        UserDictionaryBuilderOptions::default()
124            .user_dictionary_fields_num(self.metadata.user_dictionary_schema.field_count())
125            .dictionary_fields_num(self.metadata.dictionary_schema.field_count())
126            .default_word_cost(self.metadata.default_word_cost)
127            .default_left_context_id(self.metadata.default_left_context_id)
128            .default_right_context_id(self.metadata.default_right_context_id)
129            .flexible_csv(self.metadata.flexible_csv)
130            .user_dictionary_handler(Some(Box::new(move |row: &StringRecord| {
131                // Map user dictionary fields to dictionary schema fields
132                let mut result = Vec::new();
133
134                // Skip the first 4 common fields (surface, left_id, right_id, cost)
135                for field_name in dict_schema.get_custom_fields() {
136                    if let Some(idx) = userdic_schema.get_field_index(field_name) {
137                        // If field exists in user dictionary schema, get value from CSV
138                        if idx < row.len() {
139                            result.push(row[idx].to_string());
140                        } else {
141                            result.push(default_field_value.clone());
142                        }
143                    } else {
144                        // Field not in user dictionary schema, use default value
145                        result.push(default_field_value.clone());
146                    }
147                }
148
149                Ok(result)
150            })))
151            .builder()
152            .unwrap()
153            .build(input_file)
154    }
155}