Skip to main content

lindera_dictionary/
builder.rs

1pub mod character_definition;
2pub mod connection_cost_matrix;
3pub mod metadata;
4pub mod prefix_dictionary;
5pub mod unknown_dictionary;
6pub mod user_dictionary;
7
8use std::fs;
9use std::path::Path;
10
11use csv::StringRecord;
12
13use self::character_definition::CharacterDefinitionBuilderOptions;
14use self::connection_cost_matrix::ConnectionCostMatrixBuilderOptions;
15use self::metadata::MetadataBuilder;
16use self::prefix_dictionary::PrefixDictionaryBuilderOptions;
17use self::unknown_dictionary::UnknownDictionaryBuilderOptions;
18use self::user_dictionary::{UserDictionaryBuilderOptions, build_user_dictionary};
19use crate::LinderaResult;
20use crate::dictionary::UserDictionary;
21use crate::dictionary::character_definition::CharacterDefinition;
22use crate::dictionary::metadata::Metadata;
23use crate::error::LinderaErrorKind;
24
25#[derive(Clone)]
26pub struct DictionaryBuilder {
27    metadata: Metadata,
28}
29
30impl DictionaryBuilder {
31    pub fn new(metadata: Metadata) -> Self {
32        Self { metadata }
33    }
34
35    pub fn build_dictionary(&self, input_dir: &Path, output_dir: &Path) -> LinderaResult<()> {
36        fs::create_dir_all(output_dir)
37            .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
38
39        self.build_metadata(output_dir)?;
40        let chardef = self.build_character_definition(input_dir, output_dir)?;
41        self.build_unknown_dictionary(input_dir, output_dir, &chardef)?;
42        self.build_prefix_dictionary(input_dir, output_dir)?;
43        self.build_connection_cost_matrix(input_dir, output_dir)?;
44
45        Ok(())
46    }
47
48    pub fn build_metadata(&self, output_dir: &Path) -> LinderaResult<()> {
49        MetadataBuilder::new().build(&self.metadata, output_dir)
50    }
51
52    pub fn build_character_definition(
53        &self,
54        input_dir: &Path,
55        output_dir: &Path,
56    ) -> LinderaResult<CharacterDefinition> {
57        CharacterDefinitionBuilderOptions::default()
58            .encoding(self.metadata.encoding.clone())
59            .builder()
60            .unwrap()
61            .build(input_dir, output_dir)
62    }
63
64    pub fn build_unknown_dictionary(
65        &self,
66        input_dir: &Path,
67        output_dir: &Path,
68        chardef: &CharacterDefinition,
69    ) -> LinderaResult<()> {
70        UnknownDictionaryBuilderOptions::default()
71            .encoding(self.metadata.encoding.clone())
72            .builder()
73            .unwrap()
74            .build(input_dir, chardef, output_dir)
75    }
76
77    pub fn build_prefix_dictionary(
78        &self,
79        input_dir: &Path,
80        output_dir: &Path,
81    ) -> LinderaResult<()> {
82        PrefixDictionaryBuilderOptions::default()
83            .flexible_csv(self.metadata.flexible_csv)
84            .encoding(self.metadata.encoding.clone())
85            .skip_invalid_cost_or_id(self.metadata.skip_invalid_cost_or_id)
86            .normalize_details(self.metadata.normalize_details)
87            .schema(self.metadata.dictionary_schema.clone())
88            .builder()
89            .unwrap()
90            .build(input_dir, output_dir)
91    }
92
93    pub fn build_connection_cost_matrix(
94        &self,
95        input_dir: &Path,
96        output_dir: &Path,
97    ) -> LinderaResult<()> {
98        ConnectionCostMatrixBuilderOptions::default()
99            .encoding(self.metadata.encoding.clone())
100            .builder()
101            .unwrap()
102            .build(input_dir, output_dir)
103    }
104
105    pub fn build_user_dictionary(
106        &self,
107        input_file: &Path,
108        output_file: &Path,
109    ) -> LinderaResult<()> {
110        let user_dict = self.build_user_dict(input_file)?;
111        build_user_dictionary(user_dict, output_file)
112    }
113
114    pub fn build_user_dict(&self, input_file: &Path) -> LinderaResult<UserDictionary> {
115        let userdic_schema = self.metadata.user_dictionary_schema.clone();
116        let dict_schema = self.metadata.dictionary_schema.clone();
117        let default_field_value = self.metadata.default_field_value.clone();
118
119        UserDictionaryBuilderOptions::default()
120            .user_dictionary_fields_num(self.metadata.user_dictionary_schema.field_count())
121            .dictionary_fields_num(self.metadata.dictionary_schema.field_count())
122            .default_word_cost(self.metadata.default_word_cost)
123            .default_left_context_id(self.metadata.default_left_context_id)
124            .default_right_context_id(self.metadata.default_right_context_id)
125            .flexible_csv(self.metadata.flexible_csv)
126            .user_dictionary_handler(Some(Box::new(move |row: &StringRecord| {
127                // Map user dictionary fields to dictionary schema fields
128                let mut result = Vec::new();
129
130                // Skip the first 4 common fields (surface, left_id, right_id, cost)
131                for field_name in dict_schema.get_custom_fields() {
132                    if let Some(idx) = userdic_schema.get_field_index(field_name) {
133                        // If field exists in user dictionary schema, get value from CSV
134                        if idx < row.len() {
135                            result.push(row[idx].to_string());
136                        } else {
137                            result.push(default_field_value.clone());
138                        }
139                    } else {
140                        // Field not in user dictionary schema, use default value
141                        result.push(default_field_value.clone());
142                    }
143                }
144
145                Ok(result)
146            })))
147            .builder()
148            .unwrap()
149            .build(input_file)
150    }
151}