Skip to main content

lindera_dictionary/
dictionary.rs

1pub mod character_definition;
2pub mod connection_cost_matrix;
3pub mod metadata;
4pub mod prefix_dictionary;
5pub mod schema;
6pub mod unknown_dictionary;
7
8use std::fs;
9use std::path::Path;
10use std::str;
11
12use byteorder::{ByteOrder, LittleEndian};
13use once_cell::sync::Lazy;
14use rkyv::{Archive, Deserialize as RkyvDeserialize, Serialize as RkyvSerialize};
15use serde::{Deserialize, Serialize};
16
17use crate::LinderaResult;
18use crate::dictionary::character_definition::CharacterDefinition;
19use crate::dictionary::connection_cost_matrix::ConnectionCostMatrix;
20use crate::dictionary::metadata::Metadata;
21use crate::dictionary::prefix_dictionary::PrefixDictionary;
22use crate::dictionary::unknown_dictionary::UnknownDictionary;
23use crate::error::LinderaErrorKind;
24use crate::loader::character_definition::CharacterDefinitionLoader;
25use crate::loader::connection_cost_matrix::ConnectionCostMatrixLoader;
26use crate::loader::metadata::MetadataLoader;
27use crate::loader::prefix_dictionary::PrefixDictionaryLoader;
28use crate::loader::unknown_dictionary::UnknownDictionaryLoader;
29
/// Fallback detail list containing the single field `"UNK"`.
///
/// The `word_details` lookups in this file return a copy of this list when an entry
/// cannot be resolved or decoded. It is built once on first access.
pub static UNK: Lazy<Vec<&str>> = Lazy::new(|| vec!["UNK"]);
31
/// A dictionary made up of the components loaded from a dictionary directory
/// (see `Dictionary::load_from_path`).
#[derive(Clone)]
pub struct Dictionary {
    /// Prefix dictionary; its `words_idx_data` and `words_data` buffers back
    /// `Dictionary::word_details`.
    pub prefix_dictionary: PrefixDictionary,
    /// Connection cost matrix, loaded through `ConnectionCostMatrixLoader`.
    pub connection_cost_matrix: ConnectionCostMatrix,
    /// Character definition, loaded through `CharacterDefinitionLoader`.
    pub character_definition: CharacterDefinition,
    /// Unknown-word dictionary, queried by `Dictionary::unknown_word_details`.
    pub unknown_dictionary: UnknownDictionary,
    /// Dictionary metadata, loaded through `MetadataLoader`.
    pub metadata: Metadata,
}
40
41impl Dictionary {
42    /// Retrieve the detail fields (POS, etc.) for an unknown word entry.
43    pub fn unknown_word_details(&self, word_id: usize) -> Vec<&str> {
44        match self.unknown_dictionary.word_details(word_id as u32) {
45            Some(details) => details,
46            None => UNK.to_vec(),
47        }
48    }
49
50    pub fn word_details(&self, word_id: usize) -> Vec<&str> {
51        if 4 * word_id >= self.prefix_dictionary.words_idx_data.len() {
52            return vec![];
53        }
54
55        let idx: usize = match LittleEndian::read_u32(
56            &self.prefix_dictionary.words_idx_data[4 * word_id..][..4],
57        )
58        .try_into()
59        {
60            Ok(value) => value,
61            Err(_) => return UNK.to_vec(), // return empty vector if conversion fails
62        };
63        let data = &self.prefix_dictionary.words_data[idx..];
64        let joined_details_len: usize = match LittleEndian::read_u32(data).try_into() {
65            Ok(value) => value,
66            Err(_) => return UNK.to_vec(), // return empty vector if conversion fails
67        };
68        let joined_details_bytes: &[u8] =
69            &self.prefix_dictionary.words_data[idx + 4..idx + 4 + joined_details_len];
70
71        let mut details = Vec::new();
72        for bytes in joined_details_bytes.split(|&b| b == 0) {
73            let detail = match str::from_utf8(bytes) {
74                Ok(s) => s,
75                Err(_) => return UNK.to_vec(), // return empty vector if conversion fails
76            };
77            details.push(detail);
78        }
79        details
80    }
81
82    /// Load dictionary from a directory containing dictionary files
83    pub fn load_from_path(dict_path: &Path) -> LinderaResult<Self> {
84        Self::load_from_path_with_options(dict_path, false)
85    }
86
87    /// Load dictionary from a directory with options
88    pub fn load_from_path_with_options(dict_path: &Path, use_mmap: bool) -> LinderaResult<Self> {
89        // Verify that the dictionary directory exists
90        if !dict_path.exists() {
91            return Err(LinderaErrorKind::Io.with_error(anyhow::anyhow!(
92                "Dictionary path does not exist: {}",
93                dict_path.display()
94            )));
95        }
96
97        if !dict_path.is_dir() {
98            return Err(LinderaErrorKind::Io.with_error(anyhow::anyhow!(
99                "Dictionary path is not a directory: {}",
100                dict_path.display()
101            )));
102        }
103
104        // Load each component from the dictionary directory
105        let metadata = MetadataLoader::load(dict_path)?;
106        let character_definition = CharacterDefinitionLoader::load(dict_path)?;
107
108        let connection_cost_matrix = {
109            #[cfg(feature = "mmap")]
110            if use_mmap {
111                ConnectionCostMatrixLoader::load_mmap(dict_path)?
112            } else {
113                ConnectionCostMatrixLoader::load(dict_path)?
114            }
115            #[cfg(not(feature = "mmap"))]
116            ConnectionCostMatrixLoader::load(dict_path)?
117        };
118
119        let prefix_dictionary = {
120            #[cfg(feature = "mmap")]
121            if use_mmap {
122                PrefixDictionaryLoader::load_mmap(dict_path)?
123            } else {
124                PrefixDictionaryLoader::load(dict_path)?
125            }
126            #[cfg(not(feature = "mmap"))]
127            PrefixDictionaryLoader::load(dict_path)?
128        };
129
130        let unknown_dictionary = UnknownDictionaryLoader::load(dict_path)?;
131
132        Ok(Dictionary {
133            prefix_dictionary,
134            connection_cost_matrix,
135            character_definition,
136            unknown_dictionary,
137            metadata,
138        })
139    }
140
141    /// Save dictionary to a directory
142    pub fn save_to_path(&self, dict_path: &Path) -> LinderaResult<()> {
143        // Create directory if it doesn't exist
144        fs::create_dir_all(dict_path)
145            .map_err(|err| LinderaErrorKind::Io.with_error(anyhow::anyhow!(err)))?;
146
147        // For now, we'll implement this as needed
148        // This would require implementing save methods for each component
149        todo!("Dictionary saving will be implemented when needed")
150    }
151}
152
/// A user dictionary wrapping a single [`PrefixDictionary`].
///
/// Derives both the serde and the rkyv serialization traits; `UserDictionary::load`
/// reads it back from rkyv bytes.
#[derive(Clone, Serialize, Deserialize, Archive, RkyvSerialize, RkyvDeserialize)]
pub struct UserDictionary {
    /// Prefix dictionary whose `words_idx_data` and `words_data` buffers back
    /// `UserDictionary::word_details`.
    pub dict: PrefixDictionary,
}
158
159impl UserDictionary {
160    pub fn load(user_dict_data: &[u8]) -> LinderaResult<UserDictionary> {
161        let mut aligned = rkyv::util::AlignedVec::<16>::new();
162        aligned.extend_from_slice(user_dict_data);
163        rkyv::from_bytes::<UserDictionary, rkyv::rancor::Error>(&aligned).map_err(|err| {
164            LinderaErrorKind::Deserialize.with_error(anyhow::anyhow!(err.to_string()))
165        })
166    }
167
168    pub fn word_details(&self, word_id: usize) -> Vec<&str> {
169        if 4 * word_id >= self.dict.words_idx_data.len() {
170            return UNK.to_vec(); // return empty vector if conversion fails
171        }
172        let idx = LittleEndian::read_u32(&self.dict.words_idx_data[4 * word_id..][..4]);
173        let data = &self.dict.words_data[idx as usize..];
174
175        // Parse the data in the same format as main Dictionary
176        let joined_details_len: usize = match LittleEndian::read_u32(data).try_into() {
177            Ok(value) => value,
178            Err(_) => return UNK.to_vec(), // return empty vector if conversion fails
179        };
180        let joined_details_bytes: &[u8] =
181            &self.dict.words_data[idx as usize + 4..idx as usize + 4 + joined_details_len];
182
183        let mut details = Vec::new();
184        for bytes in joined_details_bytes.split(|&b| b == 0) {
185            let detail = match str::from_utf8(bytes) {
186                Ok(s) => s,
187                Err(_) => return UNK.to_vec(), // return empty vector if conversion fails
188            };
189            details.push(detail);
190        }
191        details
192    }
193}