lindera_dictionary/
lib.rs

1use std::borrow::Cow;
2use std::fs;
3use std::path::PathBuf;
4use std::str::FromStr;
5
6use serde::{Deserialize, Serialize};
7use strum::IntoEnumIterator;
8use strum_macros::EnumIter;
9
10use lindera_cc_cedict_builder::cc_cedict_builder::CcCedictBuilder;
11use lindera_core::character_definition::CharacterDefinitions;
12use lindera_core::connection::ConnectionCostMatrix;
13use lindera_core::dictionary::{Dictionary, UserDictionary};
14use lindera_core::dictionary_builder::DictionaryBuilder;
15use lindera_core::error::{LinderaError, LinderaErrorKind};
16use lindera_core::prefix_dict::PrefixDict;
17use lindera_core::unknown_dictionary::UnknownDictionary;
18use lindera_core::LinderaResult;
19use lindera_ipadic_builder::ipadic_builder::IpadicBuilder;
20use lindera_ipadic_neologd_builder::ipadic_neologd_builder::IpadicNeologdBuilder;
21use lindera_ko_dic_builder::ko_dic_builder::KoDicBuilder;
22use lindera_unidic_builder::unidic_builder::UnidicBuilder;
23
/// Identifies a dictionary that this crate can build or load.
///
/// Each variant has a lowercase string form, given by its `serde` rename. The same
/// text is returned by `DictionaryKind::as_str` and accepted by the `FromStr`
/// implementation for this type.
#[derive(Debug, Clone, EnumIter, Deserialize, Serialize, PartialEq, Eq)]
pub enum DictionaryKind {
    /// IPADIC; string form `ipadic`.
    #[serde(rename = "ipadic")]
    IPADIC,
    /// IPADIC NEologd; string form `ipadic-neologd`.
    #[serde(rename = "ipadic-neologd")]
    IPADICNEologd,
    /// UniDic; string form `unidic`.
    #[serde(rename = "unidic")]
    UniDic,
    /// ko-dic; string form `ko-dic`.
    #[serde(rename = "ko-dic")]
    KoDic,
    /// CC-CEDICT; string form `cc-cedict`.
    #[serde(rename = "cc-cedict")]
    CcCedict,
}
37
38impl DictionaryKind {
39    pub fn variants() -> Vec<DictionaryKind> {
40        DictionaryKind::iter().collect::<Vec<_>>()
41    }
42
43    pub fn contained_variants() -> Vec<DictionaryKind> {
44        DictionaryKind::variants()
45            .into_iter()
46            .filter(|kind| match kind {
47                DictionaryKind::IPADIC => cfg!(feature = "ipadic"),
48                DictionaryKind::IPADICNEologd => cfg!(feature = "ipadic-neologd"),
49                DictionaryKind::UniDic => cfg!(feature = "unidic"),
50                DictionaryKind::KoDic => cfg!(feature = "ko-dic"),
51                DictionaryKind::CcCedict => cfg!(feature = "cc-cedict"),
52            })
53            .collect::<Vec<_>>()
54    }
55
56    pub fn as_str(&self) -> &str {
57        match self {
58            DictionaryKind::IPADIC => "ipadic",
59            DictionaryKind::IPADICNEologd => "ipadic-neologd",
60            DictionaryKind::UniDic => "unidic",
61            DictionaryKind::KoDic => "ko-dic",
62            DictionaryKind::CcCedict => "cc-cedict",
63        }
64    }
65}
66
67impl FromStr for DictionaryKind {
68    type Err = LinderaError;
69    fn from_str(input: &str) -> Result<DictionaryKind, Self::Err> {
70        match input {
71            "ipadic" => Ok(DictionaryKind::IPADIC),
72            "ipadic-neologd" => Ok(DictionaryKind::IPADICNEologd),
73            "unidic" => Ok(DictionaryKind::UniDic),
74            "ko-dic" => Ok(DictionaryKind::KoDic),
75            "cc-cedict" => Ok(DictionaryKind::CcCedict),
76            _ => Err(LinderaErrorKind::DictionaryKindError
77                .with_error(anyhow::anyhow!("Invalid dictionary kind: {}", input))),
78        }
79    }
80}
81
/// Dictionary config
///
/// Use this if you want to use a dictionary when tokenizing.
///
/// Either `kind` or `path` must be specified. If both are set,
/// `DictionaryLoader::load_dictionary_from_config` uses `kind` and ignores `path`.
///
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)]
pub struct DictionaryConfig {
    /// Specify the kind of dictionary (IPADIC, IPADIC NEologd, UniDic, ko-dic, CC-CEDICT) if a self-contained dictionary is used for tokenization.
    pub kind: Option<DictionaryKind>,
    /// Specifies the path to the directory of a pre-built external dictionary if one is used.
    pub path: Option<PathBuf>,
}
95
/// User dictionary config
///
/// Use this if you want to use a user dictionary when tokenizing.
///
/// `DictionaryLoader::load_user_dictionary_from_config` picks the loader from the
/// file extension of `path`: `csv` for a CSV source, `bin` for a pre-built binary.
///
#[derive(Debug, Clone, Deserialize, Serialize, PartialEq, Eq)]
pub struct UserDictionaryConfig {
    /// Path to the user dictionary file.
    pub path: PathBuf,
    /// If the user dictionary was in CSV format, specify the dictionary type (IPADIC, IPADIC NEologd, UniDic, ko-dic or CC-CEDICT).
    /// Required for CSV files; not consulted for `bin` files.
    pub kind: Option<DictionaryKind>,
}
107
108pub struct DictionaryBuilderResolver {}
109
110impl DictionaryBuilderResolver {
111    pub fn resolve_builder(
112        dictionary_type: DictionaryKind,
113    ) -> LinderaResult<Box<dyn DictionaryBuilder>> {
114        match dictionary_type {
115            DictionaryKind::IPADIC => Ok(Box::new(IpadicBuilder::new())),
116            DictionaryKind::IPADICNEologd => Ok(Box::new(IpadicNeologdBuilder::new())),
117            DictionaryKind::UniDic => Ok(Box::new(UnidicBuilder::new())),
118            DictionaryKind::KoDic => Ok(Box::new(KoDicBuilder::new())),
119            DictionaryKind::CcCedict => Ok(Box::new(CcCedictBuilder::new())),
120        }
121    }
122}
123
124pub struct DictionaryLoader {}
125
126impl DictionaryLoader {
127    fn read_file(path: PathBuf) -> LinderaResult<Vec<u8>> {
128        fs::read(path).map_err(|e| LinderaErrorKind::Io.with_error(e))
129    }
130
131    pub fn prefix_dict(dir: PathBuf) -> LinderaResult<PrefixDict> {
132        let unidic_data_path = dir.join("dict.da");
133        let unidic_data = Self::read_file(unidic_data_path)?;
134
135        let unidic_vals_path = dir.join("dict.vals");
136        let unidic_vals = Self::read_file(unidic_vals_path)?;
137
138        Ok(PrefixDict::from_static_slice(
139            unidic_data.as_slice(),
140            unidic_vals.as_slice(),
141        ))
142    }
143
144    pub fn connection(dir: PathBuf) -> LinderaResult<ConnectionCostMatrix> {
145        let path = dir.join("matrix.mtx");
146        let data = Self::read_file(path)?;
147
148        Ok(ConnectionCostMatrix::load(data.as_slice()))
149    }
150
151    pub fn char_def(dir: PathBuf) -> LinderaResult<CharacterDefinitions> {
152        let path = dir.join("char_def.bin");
153        let data = Self::read_file(path)?;
154
155        CharacterDefinitions::load(data.as_slice())
156    }
157
158    pub fn unknown_dict(dir: PathBuf) -> LinderaResult<UnknownDictionary> {
159        let path = dir.join("unk.bin");
160        let data = Self::read_file(path)?;
161
162        UnknownDictionary::load(data.as_slice())
163    }
164
165    pub fn words_idx_data(dir: PathBuf) -> LinderaResult<Vec<u8>> {
166        let path = dir.join("dict.wordsidx");
167        Self::read_file(path)
168    }
169
170    pub fn words_data(dir: PathBuf) -> LinderaResult<Vec<u8>> {
171        let path = dir.join("dict.words");
172        Self::read_file(path)
173    }
174
175    pub fn load_dictionary(path: PathBuf) -> LinderaResult<Dictionary> {
176        Ok(Dictionary {
177            dict: Self::prefix_dict(path.clone())?,
178            cost_matrix: Self::connection(path.clone())?,
179            char_definitions: Self::char_def(path.clone())?,
180            unknown_dictionary: Self::unknown_dict(path.clone())?,
181            words_idx_data: Cow::Owned(Self::words_idx_data(path.clone())?),
182            words_data: Cow::Owned(Self::words_data(path)?),
183        })
184    }
185
186    pub fn load_dictionary_from_kind(kind: DictionaryKind) -> LinderaResult<Dictionary> {
187        // The dictionary specified by the feature flag will be loaded.
188        match kind {
189            #[cfg(feature = "ipadic")]
190            DictionaryKind::IPADIC => lindera_ipadic::load_dictionary()
191                .map_err(|e| LinderaErrorKind::DictionaryNotFound.with_error(e)),
192            #[cfg(feature = "ipadic-neologd")]
193            DictionaryKind::IPADICNEologd => lindera_ipadic_neologd::load_dictionary()
194                .map_err(|e| LinderaErrorKind::DictionaryNotFound.with_error(e)),
195            #[cfg(feature = "unidic")]
196            DictionaryKind::UniDic => lindera_unidic::load_dictionary()
197                .map_err(|e| LinderaErrorKind::DictionaryNotFound.with_error(e)),
198            #[cfg(feature = "ko-dic")]
199            DictionaryKind::KoDic => lindera_ko_dic::load_dictionary()
200                .map_err(|e| LinderaErrorKind::DictionaryNotFound.with_error(e)),
201            #[cfg(feature = "cc-cedict")]
202            DictionaryKind::CcCedict => lindera_cc_cedict::load_dictionary()
203                .map_err(|e| LinderaErrorKind::DictionaryNotFound.with_error(e)),
204            #[allow(unreachable_patterns)]
205            _ => Err(LinderaErrorKind::Args
206                .with_error(anyhow::anyhow!("Invalid dictionary type: {:?}", kind))),
207        }
208    }
209
210    pub fn load_dictionary_from_config(
211        dictionary_config: DictionaryConfig,
212    ) -> LinderaResult<Dictionary> {
213        match dictionary_config.kind {
214            Some(kind) => {
215                // The dictionary specified by the feature flag will be loaded.
216                Self::load_dictionary_from_kind(kind)
217            }
218            None => {
219                match dictionary_config.path {
220                    Some(path) => {
221                        // load external dictionary from path
222                        Self::load_dictionary(path)
223                    }
224                    None => Err(LinderaErrorKind::Args
225                        .with_error(anyhow::anyhow!("Dictionary must be specified"))),
226                }
227            }
228        }
229    }
230
231    pub fn load_user_dictionary_from_csv(
232        kind: DictionaryKind,
233        path: PathBuf,
234    ) -> LinderaResult<UserDictionary> {
235        let builder = DictionaryBuilderResolver::resolve_builder(kind)?;
236        builder
237            .build_user_dict(path.as_path())
238            .map_err(|err| LinderaErrorKind::DictionaryBuildError.with_error(err))
239    }
240
241    pub fn load_user_dictionary_from_bin(path: PathBuf) -> LinderaResult<UserDictionary> {
242        UserDictionary::load(&Self::read_file(path)?)
243    }
244
245    pub fn load_user_dictionary_from_config(
246        dictionary_config: UserDictionaryConfig,
247    ) -> LinderaResult<UserDictionary> {
248        match dictionary_config.path.extension() {
249            Some(ext) => match ext.to_str() {
250                Some("csv") => match dictionary_config.kind {
251                    Some(kind) => Self::load_user_dictionary_from_csv(kind, dictionary_config.path),
252                    None => Err(LinderaErrorKind::Args.with_error(anyhow::anyhow!(
253                        "Dictionary type must be specified if CSV file specified"
254                    ))),
255                },
256                Some("bin") => Self::load_user_dictionary_from_bin(dictionary_config.path),
257                _ => Err(LinderaErrorKind::Args.with_error(anyhow::anyhow!(
258                    "Invalid user dictionary source file extension"
259                ))),
260            },
261            None => Err(LinderaErrorKind::Args
262                .with_error(anyhow::anyhow!("Invalid user dictionary source file"))),
263        }
264    }
265}