lindera_dictionary/
builder.rs1pub mod character_definition;
2pub mod connection_cost_matrix;
3pub mod metadata;
4pub mod prefix_dictionary;
5pub mod unknown_dictionary;
6pub mod user_dictionary;
7
8use std::fs;
9use std::path::Path;
10
11use csv::StringRecord;
12
13use self::character_definition::CharacterDefinitionBuilderOptions;
14use self::connection_cost_matrix::ConnectionCostMatrixBuilderOptions;
15use self::metadata::MetadataBuilder;
16use self::prefix_dictionary::PrefixDictionaryBuilderOptions;
17use self::unknown_dictionary::UnknownDictionaryBuilderOptions;
18use self::user_dictionary::{UserDictionaryBuilderOptions, build_user_dictionary};
19use crate::LinderaResult;
20use crate::dictionary::UserDictionary;
21use crate::dictionary::character_definition::CharacterDefinition;
22use crate::dictionary::metadata::Metadata;
23use crate::error::LinderaErrorKind;
24
25#[derive(Clone)]
26pub struct DictionaryBuilder {
27 metadata: Metadata,
28}
29
30impl DictionaryBuilder {
31 pub fn new(metadata: Metadata) -> Self {
32 Self { metadata }
33 }
34
35 pub fn build_dictionary(&self, input_dir: &Path, output_dir: &Path) -> LinderaResult<()> {
36 fs::create_dir_all(output_dir)
37 .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
38
39 self.build_metadata(output_dir)?;
40 let chardef = self.build_character_definition(input_dir, output_dir)?;
41 self.build_unknown_dictionary(input_dir, output_dir, &chardef)?;
42 self.build_prefix_dictionary(input_dir, output_dir)?;
43 self.build_connection_cost_matrix(input_dir, output_dir)?;
44
45 Ok(())
46 }
47
48 pub fn build_metadata(&self, output_dir: &Path) -> LinderaResult<()> {
49 MetadataBuilder::new().build(&self.metadata, output_dir)
50 }
51
52 pub fn build_character_definition(
53 &self,
54 input_dir: &Path,
55 output_dir: &Path,
56 ) -> LinderaResult<CharacterDefinition> {
57 CharacterDefinitionBuilderOptions::default()
58 .encoding(self.metadata.encoding.clone())
59 .builder()
60 .unwrap()
61 .build(input_dir, output_dir)
62 }
63
64 pub fn build_unknown_dictionary(
65 &self,
66 input_dir: &Path,
67 output_dir: &Path,
68 chardef: &CharacterDefinition,
69 ) -> LinderaResult<()> {
70 UnknownDictionaryBuilderOptions::default()
71 .encoding(self.metadata.encoding.clone())
72 .builder()
73 .unwrap()
74 .build(input_dir, chardef, output_dir)
75 }
76
77 pub fn build_prefix_dictionary(
78 &self,
79 input_dir: &Path,
80 output_dir: &Path,
81 ) -> LinderaResult<()> {
82 PrefixDictionaryBuilderOptions::default()
83 .flexible_csv(self.metadata.flexible_csv)
84 .encoding(self.metadata.encoding.clone())
85 .skip_invalid_cost_or_id(self.metadata.skip_invalid_cost_or_id)
86 .normalize_details(self.metadata.normalize_details)
87 .schema(self.metadata.dictionary_schema.clone())
88 .builder()
89 .unwrap()
90 .build(input_dir, output_dir)
91 }
92
93 pub fn build_connection_cost_matrix(
94 &self,
95 input_dir: &Path,
96 output_dir: &Path,
97 ) -> LinderaResult<()> {
98 ConnectionCostMatrixBuilderOptions::default()
99 .encoding(self.metadata.encoding.clone())
100 .builder()
101 .unwrap()
102 .build(input_dir, output_dir)
103 }
104
105 pub fn build_user_dictionary(
106 &self,
107 input_file: &Path,
108 output_file: &Path,
109 ) -> LinderaResult<()> {
110 let user_dict = self.build_user_dict(input_file)?;
111 build_user_dictionary(user_dict, output_file)
112 }
113
114 pub fn build_user_dict(&self, input_file: &Path) -> LinderaResult<UserDictionary> {
115 let userdic_schema = self.metadata.user_dictionary_schema.clone();
116 let dict_schema = self.metadata.dictionary_schema.clone();
117 let default_field_value = self.metadata.default_field_value.clone();
118
119 UserDictionaryBuilderOptions::default()
120 .user_dictionary_fields_num(self.metadata.user_dictionary_schema.field_count())
121 .dictionary_fields_num(self.metadata.dictionary_schema.field_count())
122 .default_word_cost(self.metadata.default_word_cost)
123 .default_left_context_id(self.metadata.default_left_context_id)
124 .default_right_context_id(self.metadata.default_right_context_id)
125 .flexible_csv(self.metadata.flexible_csv)
126 .user_dictionary_handler(Some(Box::new(move |row: &StringRecord| {
127 let mut result = Vec::new();
129
130 for field_name in dict_schema.get_custom_fields() {
132 if let Some(idx) = userdic_schema.get_field_index(field_name) {
133 if idx < row.len() {
135 result.push(row[idx].to_string());
136 } else {
137 result.push(default_field_value.clone());
138 }
139 } else {
140 result.push(default_field_value.clone());
142 }
143 }
144
145 Ok(result)
146 })))
147 .builder()
148 .unwrap()
149 .build(input_file)
150 }
151}