lindera_dictionary/
dictionary.rs1pub mod character_definition;
2pub mod connection_cost_matrix;
3pub mod metadata;
4pub mod prefix_dictionary;
5pub mod schema;
6pub mod unknown_dictionary;
7
8use std::fs;
9use std::path::Path;
10use std::str;
11
12use byteorder::{ByteOrder, LittleEndian};
13use once_cell::sync::Lazy;
14use serde::{Deserialize, Serialize};
15
16use crate::LinderaResult;
17use crate::dictionary::character_definition::CharacterDefinition;
18use crate::dictionary::connection_cost_matrix::ConnectionCostMatrix;
19use crate::dictionary::metadata::Metadata;
20use crate::dictionary::prefix_dictionary::PrefixDictionary;
21use crate::dictionary::unknown_dictionary::UnknownDictionary;
22use crate::dictionary_loader::character_definition::CharacterDefinitionLoader;
23use crate::dictionary_loader::connection_cost_matrix::ConnectionCostMatrixLoader;
24use crate::dictionary_loader::metadata::MetadataLoader;
25use crate::dictionary_loader::prefix_dictionary::PrefixDictionaryLoader;
26use crate::dictionary_loader::unknown_dictionary::UnknownDictionaryLoader;
27use crate::error::LinderaErrorKind;
28
/// Fallback detail list holding the single entry `"UNK"`.
///
/// The `word_details` methods in this file return a copy of this list when a
/// stored record cannot be decoded (failed integer conversion or invalid UTF-8).
pub static UNK: Lazy<Vec<&str>> = Lazy::new(|| vec!["UNK"]);
30
/// A dictionary made up of five separately loaded components.
///
/// `Dictionary::load_from_path_with_options` fills every field from the
/// loader of the matching name, all reading from the same directory.
#[derive(Clone)]
pub struct Dictionary {
    /// Provides the `words_idx_data` and `words_data` byte tables that
    /// `Dictionary::word_details` decodes.
    pub prefix_dictionary: PrefixDictionary,
    /// Component produced by `ConnectionCostMatrixLoader`.
    pub connection_cost_matrix: ConnectionCostMatrix,
    /// Component produced by `CharacterDefinitionLoader`.
    pub character_definition: CharacterDefinition,
    /// Component produced by `UnknownDictionaryLoader`.
    pub unknown_dictionary: UnknownDictionary,
    /// Component produced by `MetadataLoader`.
    pub metadata: Metadata,
}
39
40impl Dictionary {
41 pub fn word_details(&self, word_id: usize) -> Vec<&str> {
42 if 4 * word_id >= self.prefix_dictionary.words_idx_data.len() {
43 return vec![];
44 }
45
46 let idx: usize = match LittleEndian::read_u32(
47 &self.prefix_dictionary.words_idx_data[4 * word_id..][..4],
48 )
49 .try_into()
50 {
51 Ok(value) => value,
52 Err(_) => return UNK.to_vec(), };
54 let data = &self.prefix_dictionary.words_data[idx..];
55 let joined_details_len: usize = match LittleEndian::read_u32(data).try_into() {
56 Ok(value) => value,
57 Err(_) => return UNK.to_vec(), };
59 let joined_details_bytes: &[u8] =
60 &self.prefix_dictionary.words_data[idx + 4..idx + 4 + joined_details_len];
61
62 let mut details = Vec::new();
63 for bytes in joined_details_bytes.split(|&b| b == 0) {
64 let detail = match str::from_utf8(bytes) {
65 Ok(s) => s,
66 Err(_) => return UNK.to_vec(), };
68 details.push(detail);
69 }
70 details
71 }
72
73 pub fn load_from_path(dict_path: &Path) -> LinderaResult<Self> {
75 Self::load_from_path_with_options(dict_path, false)
76 }
77
78 pub fn load_from_path_with_options(dict_path: &Path, use_mmap: bool) -> LinderaResult<Self> {
80 if !dict_path.exists() {
82 return Err(LinderaErrorKind::Io.with_error(anyhow::anyhow!(
83 "Dictionary path does not exist: {}",
84 dict_path.display()
85 )));
86 }
87
88 if !dict_path.is_dir() {
89 return Err(LinderaErrorKind::Io.with_error(anyhow::anyhow!(
90 "Dictionary path is not a directory: {}",
91 dict_path.display()
92 )));
93 }
94
95 let metadata = MetadataLoader::load(dict_path)?;
97 let character_definition = CharacterDefinitionLoader::load(dict_path)?;
98
99 let connection_cost_matrix = {
100 #[cfg(feature = "mmap")]
101 if use_mmap {
102 ConnectionCostMatrixLoader::load_mmap(dict_path)?
103 } else {
104 ConnectionCostMatrixLoader::load(dict_path)?
105 }
106 #[cfg(not(feature = "mmap"))]
107 ConnectionCostMatrixLoader::load(dict_path)?
108 };
109
110 let prefix_dictionary = {
111 #[cfg(feature = "mmap")]
112 if use_mmap {
113 PrefixDictionaryLoader::load_mmap(dict_path)?
114 } else {
115 PrefixDictionaryLoader::load(dict_path)?
116 }
117 #[cfg(not(feature = "mmap"))]
118 PrefixDictionaryLoader::load(dict_path)?
119 };
120
121 let unknown_dictionary = UnknownDictionaryLoader::load(dict_path)?;
122
123 Ok(Dictionary {
124 prefix_dictionary,
125 connection_cost_matrix,
126 character_definition,
127 unknown_dictionary,
128 metadata,
129 })
130 }
131
132 pub fn save_to_path(&self, dict_path: &Path) -> LinderaResult<()> {
134 fs::create_dir_all(dict_path)
136 .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
137
138 todo!("Dictionary saving will be implemented when needed")
141 }
142}
143
/// A user-supplied dictionary wrapping a single `PrefixDictionary`.
///
/// The type derives `Serialize` and `Deserialize`; `UserDictionary::load`
/// decodes it from bytes with bincode.
#[derive(Clone, Serialize, Deserialize)]
pub struct UserDictionary {
    /// Provides the `words_idx_data` and `words_data` byte tables that
    /// `UserDictionary::word_details` decodes.
    pub dict: PrefixDictionary,
}
148
149impl UserDictionary {
150 pub fn load(user_dict_data: &[u8]) -> LinderaResult<UserDictionary> {
151 bincode::serde::decode_from_slice(user_dict_data, bincode::config::legacy())
152 .map(|(result, _len)| result)
153 .map_err(|err| LinderaErrorKind::Deserialize.with_error(anyhow::anyhow!(err)))
154 }
155
156 pub fn word_details(&self, word_id: usize) -> Vec<&str> {
157 if 4 * word_id >= self.dict.words_idx_data.len() {
158 return UNK.to_vec(); }
160 let idx = LittleEndian::read_u32(&self.dict.words_idx_data[4 * word_id..][..4]);
161 let data = &self.dict.words_data[idx as usize..];
162
163 let joined_details_len: usize = match LittleEndian::read_u32(data).try_into() {
165 Ok(value) => value,
166 Err(_) => return UNK.to_vec(), };
168 let joined_details_bytes: &[u8] =
169 &self.dict.words_data[idx as usize + 4..idx as usize + 4 + joined_details_len];
170
171 let mut details = Vec::new();
172 for bytes in joined_details_bytes.split(|&b| b == 0) {
173 let detail = match str::from_utf8(bytes) {
174 Ok(s) => s,
175 Err(_) => return UNK.to_vec(), };
177 details.push(detail);
178 }
179 details
180 }
181}