lindera_dictionary/
lib.rs1use std::borrow::Cow;
2use std::fs;
3use std::path::PathBuf;
4use std::str::FromStr;
5
6use serde::{Deserialize, Serialize};
7use strum::IntoEnumIterator;
8use strum_macros::EnumIter;
9
10use lindera_cc_cedict_builder::cc_cedict_builder::CcCedictBuilder;
11use lindera_core::character_definition::CharacterDefinitions;
12use lindera_core::connection::ConnectionCostMatrix;
13use lindera_core::dictionary::{Dictionary, UserDictionary};
14use lindera_core::dictionary_builder::DictionaryBuilder;
15use lindera_core::error::{LinderaError, LinderaErrorKind};
16use lindera_core::prefix_dict::PrefixDict;
17use lindera_core::unknown_dictionary::UnknownDictionary;
18use lindera_core::LinderaResult;
19use lindera_ipadic_builder::ipadic_builder::IpadicBuilder;
20use lindera_ipadic_neologd_builder::ipadic_neologd_builder::IpadicNeologdBuilder;
21use lindera_ko_dic_builder::ko_dic_builder::KoDicBuilder;
22use lindera_unidic_builder::unidic_builder::UnidicBuilder;
23
24#[derive(Debug, Clone, EnumIter, Deserialize, Serialize, PartialEq, Eq)]
25pub enum DictionaryKind {
26 #[serde(rename = "ipadic")]
27 IPADIC,
28 #[serde(rename = "ipadic-neologd")]
29 IPADICNEologd,
30 #[serde(rename = "unidic")]
31 UniDic,
32 #[serde(rename = "ko-dic")]
33 KoDic,
34 #[serde(rename = "cc-cedict")]
35 CcCedict,
36}
37
38impl DictionaryKind {
39 pub fn variants() -> Vec<DictionaryKind> {
40 DictionaryKind::iter().collect::<Vec<_>>()
41 }
42
43 pub fn contained_variants() -> Vec<DictionaryKind> {
44 DictionaryKind::variants()
45 .into_iter()
46 .filter(|kind| match kind {
47 DictionaryKind::IPADIC => cfg!(feature = "ipadic"),
48 DictionaryKind::IPADICNEologd => cfg!(feature = "ipadic-neologd"),
49 DictionaryKind::UniDic => cfg!(feature = "unidic"),
50 DictionaryKind::KoDic => cfg!(feature = "ko-dic"),
51 DictionaryKind::CcCedict => cfg!(feature = "cc-cedict"),
52 })
53 .collect::<Vec<_>>()
54 }
55
56 pub fn as_str(&self) -> &str {
57 match self {
58 DictionaryKind::IPADIC => "ipadic",
59 DictionaryKind::IPADICNEologd => "ipadic-neologd",
60 DictionaryKind::UniDic => "unidic",
61 DictionaryKind::KoDic => "ko-dic",
62 DictionaryKind::CcCedict => "cc-cedict",
63 }
64 }
65}
66
67impl FromStr for DictionaryKind {
68 type Err = LinderaError;
69 fn from_str(input: &str) -> Result<DictionaryKind, Self::Err> {
70 match input {
71 "ipadic" => Ok(DictionaryKind::IPADIC),
72 "ipadic-neologd" => Ok(DictionaryKind::IPADICNEologd),
73 "unidic" => Ok(DictionaryKind::UniDic),
74 "ko-dic" => Ok(DictionaryKind::KoDic),
75 "cc-cedict" => Ok(DictionaryKind::CcCedict),
76 _ => Err(LinderaErrorKind::DictionaryKindError
77 .with_error(anyhow::anyhow!("Invalid dictionary kind: {}", input))),
78 }
79 }
80}
81
82#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)]
89pub struct DictionaryConfig {
90 pub kind: Option<DictionaryKind>,
92 pub path: Option<PathBuf>,
94}
95
96#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)]
101pub struct UserDictionaryConfig {
102 pub path: PathBuf,
104 pub kind: Option<DictionaryKind>,
106}
107
108pub struct DictionaryBuilderResolver {}
109
110impl DictionaryBuilderResolver {
111 pub fn resolve_builder(
112 dictionary_type: DictionaryKind,
113 ) -> LinderaResult<Box<dyn DictionaryBuilder>> {
114 match dictionary_type {
115 DictionaryKind::IPADIC => Ok(Box::new(IpadicBuilder::new())),
116 DictionaryKind::IPADICNEologd => Ok(Box::new(IpadicNeologdBuilder::new())),
117 DictionaryKind::UniDic => Ok(Box::new(UnidicBuilder::new())),
118 DictionaryKind::KoDic => Ok(Box::new(KoDicBuilder::new())),
119 DictionaryKind::CcCedict => Ok(Box::new(CcCedictBuilder::new())),
120 }
121 }
122}
123
124pub struct DictionaryLoader {}
125
126impl DictionaryLoader {
127 fn read_file(path: PathBuf) -> LinderaResult<Vec<u8>> {
128 fs::read(path).map_err(|e| LinderaErrorKind::Io.with_error(e))
129 }
130
131 pub fn prefix_dict(dir: PathBuf) -> LinderaResult<PrefixDict> {
132 let unidic_data_path = dir.join("dict.da");
133 let unidic_data = Self::read_file(unidic_data_path)?;
134
135 let unidic_vals_path = dir.join("dict.vals");
136 let unidic_vals = Self::read_file(unidic_vals_path)?;
137
138 Ok(PrefixDict::from_static_slice(
139 unidic_data.as_slice(),
140 unidic_vals.as_slice(),
141 ))
142 }
143
144 pub fn connection(dir: PathBuf) -> LinderaResult<ConnectionCostMatrix> {
145 let path = dir.join("matrix.mtx");
146 let data = Self::read_file(path)?;
147
148 Ok(ConnectionCostMatrix::load(data.as_slice()))
149 }
150
151 pub fn char_def(dir: PathBuf) -> LinderaResult<CharacterDefinitions> {
152 let path = dir.join("char_def.bin");
153 let data = Self::read_file(path)?;
154
155 CharacterDefinitions::load(data.as_slice())
156 }
157
158 pub fn unknown_dict(dir: PathBuf) -> LinderaResult<UnknownDictionary> {
159 let path = dir.join("unk.bin");
160 let data = Self::read_file(path)?;
161
162 UnknownDictionary::load(data.as_slice())
163 }
164
165 pub fn words_idx_data(dir: PathBuf) -> LinderaResult<Vec<u8>> {
166 let path = dir.join("dict.wordsidx");
167 Self::read_file(path)
168 }
169
170 pub fn words_data(dir: PathBuf) -> LinderaResult<Vec<u8>> {
171 let path = dir.join("dict.words");
172 Self::read_file(path)
173 }
174
175 pub fn load_dictionary(path: PathBuf) -> LinderaResult<Dictionary> {
176 Ok(Dictionary {
177 dict: Self::prefix_dict(path.clone())?,
178 cost_matrix: Self::connection(path.clone())?,
179 char_definitions: Self::char_def(path.clone())?,
180 unknown_dictionary: Self::unknown_dict(path.clone())?,
181 words_idx_data: Cow::Owned(Self::words_idx_data(path.clone())?),
182 words_data: Cow::Owned(Self::words_data(path)?),
183 })
184 }
185
186 pub fn load_dictionary_from_kind(kind: DictionaryKind) -> LinderaResult<Dictionary> {
187 match kind {
189 #[cfg(feature = "ipadic")]
190 DictionaryKind::IPADIC => lindera_ipadic::load_dictionary()
191 .map_err(|e| LinderaErrorKind::DictionaryNotFound.with_error(e)),
192 #[cfg(feature = "ipadic-neologd")]
193 DictionaryKind::IPADICNEologd => lindera_ipadic_neologd::load_dictionary()
194 .map_err(|e| LinderaErrorKind::DictionaryNotFound.with_error(e)),
195 #[cfg(feature = "unidic")]
196 DictionaryKind::UniDic => lindera_unidic::load_dictionary()
197 .map_err(|e| LinderaErrorKind::DictionaryNotFound.with_error(e)),
198 #[cfg(feature = "ko-dic")]
199 DictionaryKind::KoDic => lindera_ko_dic::load_dictionary()
200 .map_err(|e| LinderaErrorKind::DictionaryNotFound.with_error(e)),
201 #[cfg(feature = "cc-cedict")]
202 DictionaryKind::CcCedict => lindera_cc_cedict::load_dictionary()
203 .map_err(|e| LinderaErrorKind::DictionaryNotFound.with_error(e)),
204 #[allow(unreachable_patterns)]
205 _ => Err(LinderaErrorKind::Args
206 .with_error(anyhow::anyhow!("Invalid dictionary type: {:?}", kind))),
207 }
208 }
209
210 pub fn load_dictionary_from_config(
211 dictionary_config: DictionaryConfig,
212 ) -> LinderaResult<Dictionary> {
213 match dictionary_config.kind {
214 Some(kind) => {
215 Self::load_dictionary_from_kind(kind)
217 }
218 None => {
219 match dictionary_config.path {
220 Some(path) => {
221 Self::load_dictionary(path)
223 }
224 None => Err(LinderaErrorKind::Args
225 .with_error(anyhow::anyhow!("Dictionary must be specified"))),
226 }
227 }
228 }
229 }
230
231 pub fn load_user_dictionary_from_csv(
232 kind: DictionaryKind,
233 path: PathBuf,
234 ) -> LinderaResult<UserDictionary> {
235 let builder = DictionaryBuilderResolver::resolve_builder(kind)?;
236 builder
237 .build_user_dict(path.as_path())
238 .map_err(|err| LinderaErrorKind::DictionaryBuildError.with_error(err))
239 }
240
241 pub fn load_user_dictionary_from_bin(path: PathBuf) -> LinderaResult<UserDictionary> {
242 UserDictionary::load(&Self::read_file(path)?)
243 }
244
245 pub fn load_user_dictionary_from_config(
246 dictionary_config: UserDictionaryConfig,
247 ) -> LinderaResult<UserDictionary> {
248 match dictionary_config.path.extension() {
249 Some(ext) => match ext.to_str() {
250 Some("csv") => match dictionary_config.kind {
251 Some(kind) => Self::load_user_dictionary_from_csv(kind, dictionary_config.path),
252 None => Err(LinderaErrorKind::Args.with_error(anyhow::anyhow!(
253 "Dictionary type must be specified if CSV file specified"
254 ))),
255 },
256 Some("bin") => Self::load_user_dictionary_from_bin(dictionary_config.path),
257 _ => Err(LinderaErrorKind::Args.with_error(anyhow::anyhow!(
258 "Invalid user dictionary source file extension"
259 ))),
260 },
261 None => Err(LinderaErrorKind::Args
262 .with_error(anyhow::anyhow!("Invalid user dictionary source file"))),
263 }
264 }
265}