lindera_dictionary/
dictionary.rs

1pub mod character_definition;
2pub mod connection_cost_matrix;
3pub mod metadata;
4pub mod prefix_dictionary;
5pub mod schema;
6pub mod unknown_dictionary;
7
8use std::fs;
9use std::path::Path;
10use std::str;
11
12use byteorder::{ByteOrder, LittleEndian};
13use once_cell::sync::Lazy;
14use serde::{Deserialize, Serialize};
15
16use crate::LinderaResult;
17use crate::dictionary::character_definition::CharacterDefinition;
18use crate::dictionary::connection_cost_matrix::ConnectionCostMatrix;
19use crate::dictionary::metadata::Metadata;
20use crate::dictionary::prefix_dictionary::PrefixDictionary;
21use crate::dictionary::unknown_dictionary::UnknownDictionary;
22use crate::error::LinderaErrorKind;
23use crate::loader::character_definition::CharacterDefinitionLoader;
24use crate::loader::connection_cost_matrix::ConnectionCostMatrixLoader;
25use crate::loader::metadata::MetadataLoader;
26use crate::loader::prefix_dictionary::PrefixDictionaryLoader;
27use crate::loader::unknown_dictionary::UnknownDictionaryLoader;
28
/// Fallback detail list containing the single entry `"UNK"`.
///
/// The `word_details` methods in this module return a copy of this list
/// (`UNK.to_vec()`) when a word record cannot be decoded.
pub static UNK: Lazy<Vec<&str>> = Lazy::new(|| vec!["UNK"]);
30
/// A loaded dictionary, made up of the five components that
/// `Dictionary::load_from_path_with_options` reads from a dictionary directory.
#[derive(Clone)]
pub struct Dictionary {
    /// Prefix dictionary; `word_details` reads its `words_idx_data` and
    /// `words_data` buffers.
    pub prefix_dictionary: PrefixDictionary,
    /// Connection cost matrix, loaded through `ConnectionCostMatrixLoader`.
    pub connection_cost_matrix: ConnectionCostMatrix,
    /// Character definition, loaded through `CharacterDefinitionLoader`.
    pub character_definition: CharacterDefinition,
    /// Unknown-word dictionary, loaded through `UnknownDictionaryLoader`.
    pub unknown_dictionary: UnknownDictionary,
    /// Dictionary metadata, loaded through `MetadataLoader`.
    pub metadata: Metadata,
}
39
40impl Dictionary {
41    pub fn word_details(&self, word_id: usize) -> Vec<&str> {
42        if 4 * word_id >= self.prefix_dictionary.words_idx_data.len() {
43            return vec![];
44        }
45
46        let idx: usize = match LittleEndian::read_u32(
47            &self.prefix_dictionary.words_idx_data[4 * word_id..][..4],
48        )
49        .try_into()
50        {
51            Ok(value) => value,
52            Err(_) => return UNK.to_vec(), // return empty vector if conversion fails
53        };
54        let data = &self.prefix_dictionary.words_data[idx..];
55        let joined_details_len: usize = match LittleEndian::read_u32(data).try_into() {
56            Ok(value) => value,
57            Err(_) => return UNK.to_vec(), // return empty vector if conversion fails
58        };
59        let joined_details_bytes: &[u8] =
60            &self.prefix_dictionary.words_data[idx + 4..idx + 4 + joined_details_len];
61
62        let mut details = Vec::new();
63        for bytes in joined_details_bytes.split(|&b| b == 0) {
64            let detail = match str::from_utf8(bytes) {
65                Ok(s) => s,
66                Err(_) => return UNK.to_vec(), // return empty vector if conversion fails
67            };
68            details.push(detail);
69        }
70        details
71    }
72
73    /// Load dictionary from a directory containing dictionary files
74    pub fn load_from_path(dict_path: &Path) -> LinderaResult<Self> {
75        Self::load_from_path_with_options(dict_path, false)
76    }
77
78    /// Load dictionary from a directory with options
79    pub fn load_from_path_with_options(dict_path: &Path, use_mmap: bool) -> LinderaResult<Self> {
80        // Verify that the dictionary directory exists
81        if !dict_path.exists() {
82            return Err(LinderaErrorKind::Io.with_error(anyhow::anyhow!(
83                "Dictionary path does not exist: {}",
84                dict_path.display()
85            )));
86        }
87
88        if !dict_path.is_dir() {
89            return Err(LinderaErrorKind::Io.with_error(anyhow::anyhow!(
90                "Dictionary path is not a directory: {}",
91                dict_path.display()
92            )));
93        }
94
95        // Load each component from the dictionary directory
96        let metadata = MetadataLoader::load(dict_path)?;
97        let character_definition = CharacterDefinitionLoader::load(dict_path)?;
98
99        let connection_cost_matrix = {
100            #[cfg(feature = "mmap")]
101            if use_mmap {
102                ConnectionCostMatrixLoader::load_mmap(dict_path)?
103            } else {
104                ConnectionCostMatrixLoader::load(dict_path)?
105            }
106            #[cfg(not(feature = "mmap"))]
107            ConnectionCostMatrixLoader::load(dict_path)?
108        };
109
110        let prefix_dictionary = {
111            #[cfg(feature = "mmap")]
112            if use_mmap {
113                PrefixDictionaryLoader::load_mmap(dict_path)?
114            } else {
115                PrefixDictionaryLoader::load(dict_path)?
116            }
117            #[cfg(not(feature = "mmap"))]
118            PrefixDictionaryLoader::load(dict_path)?
119        };
120
121        let unknown_dictionary = UnknownDictionaryLoader::load(dict_path)?;
122
123        Ok(Dictionary {
124            prefix_dictionary,
125            connection_cost_matrix,
126            character_definition,
127            unknown_dictionary,
128            metadata,
129        })
130    }
131
132    /// Save dictionary to a directory
133    pub fn save_to_path(&self, dict_path: &Path) -> LinderaResult<()> {
134        // Create directory if it doesn't exist
135        fs::create_dir_all(dict_path)
136            .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
137
138        // For now, we'll implement this as needed
139        // This would require implementing save methods for each component
140        todo!("Dictionary saving will be implemented when needed")
141    }
142}
143
/// A user-supplied dictionary wrapping a single `PrefixDictionary`.
///
/// It derives `Serialize`/`Deserialize` and can be decoded from bytes with
/// `UserDictionary::load`.
#[derive(Clone, Serialize, Deserialize)]
pub struct UserDictionary {
    /// Prefix dictionary; `word_details` reads its `words_idx_data` and
    /// `words_data` buffers.
    pub dict: PrefixDictionary,
}
148
149impl UserDictionary {
150    pub fn load(user_dict_data: &[u8]) -> LinderaResult<UserDictionary> {
151        bincode::serde::decode_from_slice(user_dict_data, bincode::config::legacy())
152            .map(|(result, _len)| result)
153            .map_err(|err| LinderaErrorKind::Deserialize.with_error(anyhow::anyhow!(err)))
154    }
155
156    pub fn word_details(&self, word_id: usize) -> Vec<&str> {
157        if 4 * word_id >= self.dict.words_idx_data.len() {
158            return UNK.to_vec(); // return empty vector if conversion fails
159        }
160        let idx = LittleEndian::read_u32(&self.dict.words_idx_data[4 * word_id..][..4]);
161        let data = &self.dict.words_data[idx as usize..];
162
163        // Parse the data in the same format as main Dictionary
164        let joined_details_len: usize = match LittleEndian::read_u32(data).try_into() {
165            Ok(value) => value,
166            Err(_) => return UNK.to_vec(), // return empty vector if conversion fails
167        };
168        let joined_details_bytes: &[u8] =
169            &self.dict.words_data[idx as usize + 4..idx as usize + 4 + joined_details_len];
170
171        let mut details = Vec::new();
172        for bytes in joined_details_bytes.split(|&b| b == 0) {
173            let detail = match str::from_utf8(bytes) {
174                Ok(s) => s,
175                Err(_) => return UNK.to_vec(), // return empty vector if conversion fails
176            };
177            details.push(detail);
178        }
179        details
180    }
181}