lindera_dictionary/
dictionary.rs1pub mod character_definition;
2pub mod connection_cost_matrix;
3pub mod metadata;
4pub mod prefix_dictionary;
5pub mod schema;
6pub mod unknown_dictionary;
7
8use std::fs;
9use std::path::Path;
10use std::str;
11
12use byteorder::{ByteOrder, LittleEndian};
13use once_cell::sync::Lazy;
14use rkyv::{Archive, Deserialize as RkyvDeserialize, Serialize as RkyvSerialize};
15use serde::{Deserialize, Serialize};
16
17use crate::LinderaResult;
18use crate::dictionary::character_definition::CharacterDefinition;
19use crate::dictionary::connection_cost_matrix::ConnectionCostMatrix;
20use crate::dictionary::metadata::Metadata;
21use crate::dictionary::prefix_dictionary::PrefixDictionary;
22use crate::dictionary::unknown_dictionary::UnknownDictionary;
23use crate::error::LinderaErrorKind;
24use crate::loader::character_definition::CharacterDefinitionLoader;
25use crate::loader::connection_cost_matrix::ConnectionCostMatrixLoader;
26use crate::loader::metadata::MetadataLoader;
27use crate::loader::prefix_dictionary::PrefixDictionaryLoader;
28use crate::loader::unknown_dictionary::UnknownDictionaryLoader;
29
30pub static UNK: Lazy<Vec<&str>> = Lazy::new(|| vec!["UNK"]);
31
32#[derive(Clone)]
33pub struct Dictionary {
34 pub prefix_dictionary: PrefixDictionary,
35 pub connection_cost_matrix: ConnectionCostMatrix,
36 pub character_definition: CharacterDefinition,
37 pub unknown_dictionary: UnknownDictionary,
38 pub metadata: Metadata,
39}
40
41impl Dictionary {
42 pub fn word_details(&self, word_id: usize) -> Vec<&str> {
43 if 4 * word_id >= self.prefix_dictionary.words_idx_data.len() {
44 return vec![];
45 }
46
47 let idx: usize = match LittleEndian::read_u32(
48 &self.prefix_dictionary.words_idx_data[4 * word_id..][..4],
49 )
50 .try_into()
51 {
52 Ok(value) => value,
53 Err(_) => return UNK.to_vec(), };
55 let data = &self.prefix_dictionary.words_data[idx..];
56 let joined_details_len: usize = match LittleEndian::read_u32(data).try_into() {
57 Ok(value) => value,
58 Err(_) => return UNK.to_vec(), };
60 let joined_details_bytes: &[u8] =
61 &self.prefix_dictionary.words_data[idx + 4..idx + 4 + joined_details_len];
62
63 let mut details = Vec::new();
64 for bytes in joined_details_bytes.split(|&b| b == 0) {
65 let detail = match str::from_utf8(bytes) {
66 Ok(s) => s,
67 Err(_) => return UNK.to_vec(), };
69 details.push(detail);
70 }
71 details
72 }
73
74 pub fn load_from_path(dict_path: &Path) -> LinderaResult<Self> {
76 Self::load_from_path_with_options(dict_path, false)
77 }
78
79 pub fn load_from_path_with_options(dict_path: &Path, use_mmap: bool) -> LinderaResult<Self> {
81 if !dict_path.exists() {
83 return Err(LinderaErrorKind::Io.with_error(anyhow::anyhow!(
84 "Dictionary path does not exist: {}",
85 dict_path.display()
86 )));
87 }
88
89 if !dict_path.is_dir() {
90 return Err(LinderaErrorKind::Io.with_error(anyhow::anyhow!(
91 "Dictionary path is not a directory: {}",
92 dict_path.display()
93 )));
94 }
95
96 let metadata = MetadataLoader::load(dict_path)?;
98 let character_definition = CharacterDefinitionLoader::load(dict_path)?;
99
100 let connection_cost_matrix = {
101 #[cfg(feature = "mmap")]
102 if use_mmap {
103 ConnectionCostMatrixLoader::load_mmap(dict_path)?
104 } else {
105 ConnectionCostMatrixLoader::load(dict_path)?
106 }
107 #[cfg(not(feature = "mmap"))]
108 ConnectionCostMatrixLoader::load(dict_path)?
109 };
110
111 let prefix_dictionary = {
112 #[cfg(feature = "mmap")]
113 if use_mmap {
114 PrefixDictionaryLoader::load_mmap(dict_path)?
115 } else {
116 PrefixDictionaryLoader::load(dict_path)?
117 }
118 #[cfg(not(feature = "mmap"))]
119 PrefixDictionaryLoader::load(dict_path)?
120 };
121
122 let unknown_dictionary = UnknownDictionaryLoader::load(dict_path)?;
123
124 Ok(Dictionary {
125 prefix_dictionary,
126 connection_cost_matrix,
127 character_definition,
128 unknown_dictionary,
129 metadata,
130 })
131 }
132
133 pub fn save_to_path(&self, dict_path: &Path) -> LinderaResult<()> {
135 fs::create_dir_all(dict_path)
137 .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
138
139 todo!("Dictionary saving will be implemented when needed")
142 }
143}
144
145#[derive(Clone, Serialize, Deserialize, Archive, RkyvSerialize, RkyvDeserialize)]
146
147pub struct UserDictionary {
148 pub dict: PrefixDictionary,
149}
150
151impl UserDictionary {
152 pub fn load(user_dict_data: &[u8]) -> LinderaResult<UserDictionary> {
153 let mut aligned = rkyv::util::AlignedVec::<16>::new();
154 aligned.extend_from_slice(user_dict_data);
155 rkyv::from_bytes::<UserDictionary, rkyv::rancor::Error>(&aligned).map_err(|err| {
156 LinderaErrorKind::Deserialize.with_error(anyhow::anyhow!(err.to_string()))
157 })
158 }
159
160 pub fn word_details(&self, word_id: usize) -> Vec<&str> {
161 if 4 * word_id >= self.dict.words_idx_data.len() {
162 return UNK.to_vec(); }
164 let idx = LittleEndian::read_u32(&self.dict.words_idx_data[4 * word_id..][..4]);
165 let data = &self.dict.words_data[idx as usize..];
166
167 let joined_details_len: usize = match LittleEndian::read_u32(data).try_into() {
169 Ok(value) => value,
170 Err(_) => return UNK.to_vec(), };
172 let joined_details_bytes: &[u8] =
173 &self.dict.words_data[idx as usize + 4..idx as usize + 4 + joined_details_len];
174
175 let mut details = Vec::new();
176 for bytes in joined_details_bytes.split(|&b| b == 0) {
177 let detail = match str::from_utf8(bytes) {
178 Ok(s) => s,
179 Err(_) => return UNK.to_vec(), };
181 details.push(detail);
182 }
183 details
184 }
185}