lindera_dictionary/
builder.rs1pub mod character_definition;
2pub mod connection_cost_matrix;
3pub mod metadata;
4pub mod prefix_dictionary;
5pub mod unknown_dictionary;
6pub mod user_dictionary;
7
8use std::fs;
9use std::path::Path;
10
11use csv::StringRecord;
12
13use self::character_definition::CharacterDefinitionBuilderOptions;
14use self::connection_cost_matrix::ConnectionCostMatrixBuilderOptions;
15use self::metadata::MetadataBuilder;
16use self::prefix_dictionary::PrefixDictionaryBuilderOptions;
17use self::unknown_dictionary::UnknownDictionaryBuilderOptions;
18use self::user_dictionary::{UserDictionaryBuilderOptions, build_user_dictionary};
19use crate::LinderaResult;
20use crate::dictionary::UserDictionary;
21use crate::dictionary::character_definition::CharacterDefinition;
22use crate::dictionary::metadata::Metadata;
23use crate::error::LinderaErrorKind;
24
25#[derive(Clone)]
26pub struct DictionaryBuilder {
27 metadata: Metadata,
28}
29
30impl DictionaryBuilder {
31 pub fn new(metadata: Metadata) -> Self {
32 Self { metadata }
33 }
34
35 pub fn build_dictionary(&self, input_dir: &Path, output_dir: &Path) -> LinderaResult<()> {
36 fs::create_dir_all(output_dir)
37 .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
38
39 self.build_metadata(output_dir)?;
40 let chardef = self.build_character_definition(input_dir, output_dir)?;
41 self.build_unknown_dictionary(input_dir, output_dir, &chardef)?;
42 self.build_prefix_dictionary(input_dir, output_dir)?;
43 self.build_connection_cost_matrix(input_dir, output_dir)?;
44
45 Ok(())
46 }
47
48 pub fn build_metadata(&self, output_dir: &Path) -> LinderaResult<()> {
49 MetadataBuilder::new().build(&self.metadata, output_dir)
50 }
51
52 pub fn build_character_definition(
53 &self,
54 input_dir: &Path,
55 output_dir: &Path,
56 ) -> LinderaResult<CharacterDefinition> {
57 CharacterDefinitionBuilderOptions::default()
58 .encoding(self.metadata.encoding.clone())
59 .compress_algorithm(self.metadata.compress_algorithm)
60 .builder()
61 .unwrap()
62 .build(input_dir, output_dir)
63 }
64
65 pub fn build_unknown_dictionary(
66 &self,
67 input_dir: &Path,
68 output_dir: &Path,
69 chardef: &CharacterDefinition,
70 ) -> LinderaResult<()> {
71 UnknownDictionaryBuilderOptions::default()
72 .encoding(self.metadata.encoding.clone())
73 .compress_algorithm(self.metadata.compress_algorithm)
74 .builder()
75 .unwrap()
76 .build(input_dir, chardef, output_dir)
77 }
78
79 pub fn build_prefix_dictionary(
80 &self,
81 input_dir: &Path,
82 output_dir: &Path,
83 ) -> LinderaResult<()> {
84 PrefixDictionaryBuilderOptions::default()
85 .flexible_csv(self.metadata.flexible_csv)
86 .encoding(self.metadata.encoding.clone())
87 .compress_algorithm(self.metadata.compress_algorithm)
88 .skip_invalid_cost_or_id(self.metadata.skip_invalid_cost_or_id)
89 .normalize_details(self.metadata.normalize_details)
90 .schema(self.metadata.dictionary_schema.clone())
91 .builder()
92 .unwrap()
93 .build(input_dir, output_dir)
94 }
95
96 pub fn build_connection_cost_matrix(
97 &self,
98 input_dir: &Path,
99 output_dir: &Path,
100 ) -> LinderaResult<()> {
101 ConnectionCostMatrixBuilderOptions::default()
102 .encoding(self.metadata.encoding.clone())
103 .compress_algorithm(self.metadata.compress_algorithm)
104 .builder()
105 .unwrap()
106 .build(input_dir, output_dir)
107 }
108
109 pub fn build_user_dictionary(
110 &self,
111 input_file: &Path,
112 output_file: &Path,
113 ) -> LinderaResult<()> {
114 let user_dict = self.build_user_dict(input_file)?;
115 build_user_dictionary(user_dict, output_file)
116 }
117
118 pub fn build_user_dict(&self, input_file: &Path) -> LinderaResult<UserDictionary> {
119 let userdic_schema = self.metadata.user_dictionary_schema.clone();
120 let dict_schema = self.metadata.dictionary_schema.clone();
121 let default_field_value = self.metadata.default_field_value.clone();
122
123 UserDictionaryBuilderOptions::default()
124 .user_dictionary_fields_num(self.metadata.user_dictionary_schema.field_count())
125 .dictionary_fields_num(self.metadata.dictionary_schema.field_count())
126 .default_word_cost(self.metadata.default_word_cost)
127 .default_left_context_id(self.metadata.default_left_context_id)
128 .default_right_context_id(self.metadata.default_right_context_id)
129 .flexible_csv(self.metadata.flexible_csv)
130 .user_dictionary_handler(Some(Box::new(move |row: &StringRecord| {
131 let mut result = Vec::new();
133
134 for field_name in dict_schema.get_custom_fields() {
136 if let Some(idx) = userdic_schema.get_field_index(field_name) {
137 if idx < row.len() {
139 result.push(row[idx].to_string());
140 } else {
141 result.push(default_field_value.clone());
142 }
143 } else {
144 result.push(default_field_value.clone());
146 }
147 }
148
149 Ok(result)
150 })))
151 .builder()
152 .unwrap()
153 .build(input_file)
154 }
155}