lindera_dictionary/
dictionary.rs

1pub mod character_definition;
2pub mod connection_cost_matrix;
3pub mod prefix_dictionary;
4pub mod unknown_dictionary;
5
6use std::str;
7
8use byteorder::{ByteOrder, LittleEndian};
9use once_cell::sync::Lazy;
10use serde::{Deserialize, Serialize};
11
12use crate::LinderaResult;
13use crate::dictionary::character_definition::CharacterDefinition;
14use crate::dictionary::connection_cost_matrix::ConnectionCostMatrix;
15use crate::dictionary::prefix_dictionary::PrefixDictionary;
16use crate::dictionary::unknown_dictionary::UnknownDictionary;
17use crate::error::LinderaErrorKind;
18
19pub static UNK: Lazy<Vec<&str>> = Lazy::new(|| vec!["UNK"]);
20
21#[derive(Clone)]
22pub struct Dictionary {
23    pub prefix_dictionary: PrefixDictionary,
24    pub connection_cost_matrix: ConnectionCostMatrix,
25    pub character_definition: CharacterDefinition,
26    pub unknown_dictionary: UnknownDictionary,
27}
28
29impl Dictionary {
30    pub fn word_details(&self, word_id: usize) -> Vec<&str> {
31        if 4 * word_id >= self.prefix_dictionary.words_idx_data.len() {
32            return vec![];
33        }
34
35        let idx: usize = match LittleEndian::read_u32(
36            &self.prefix_dictionary.words_idx_data[4 * word_id..][..4],
37        )
38        .try_into()
39        {
40            Ok(value) => value,
41            Err(_) => return UNK.to_vec(), // return empty vector if conversion fails
42        };
43        let data = &self.prefix_dictionary.words_data[idx..];
44        let joined_details_len: usize = match LittleEndian::read_u32(data).try_into() {
45            Ok(value) => value,
46            Err(_) => return UNK.to_vec(), // return empty vector if conversion fails
47        };
48        let joined_details_bytes: &[u8] =
49            &self.prefix_dictionary.words_data[idx + 4..idx + 4 + joined_details_len];
50
51        let mut details = Vec::new();
52        for bytes in joined_details_bytes.split(|&b| b == 0) {
53            let detail = match str::from_utf8(bytes) {
54                Ok(s) => s,
55                Err(_) => return UNK.to_vec(), // return empty vector if conversion fails
56            };
57            details.push(detail);
58        }
59        details
60    }
61}
62
63#[derive(Clone, Serialize, Deserialize)]
64pub struct UserDictionary {
65    pub dict: PrefixDictionary,
66}
67
68impl UserDictionary {
69    pub fn load(user_dict_data: &[u8]) -> LinderaResult<UserDictionary> {
70        bincode::serde::decode_from_slice(user_dict_data, bincode::config::legacy())
71            .map(|(result, _len)| result)
72            .map_err(|err| LinderaErrorKind::Deserialize.with_error(anyhow::anyhow!(err)))
73    }
74
75    pub fn word_details(&self, word_id: usize) -> Vec<&str> {
76        if 4 * word_id >= self.dict.words_idx_data.len() {
77            return UNK.to_vec(); // return empty vector if conversion fails
78        }
79        let idx = LittleEndian::read_u32(&self.dict.words_idx_data[4 * word_id..][..4]);
80        let data = &self.dict.words_data[idx as usize..];
81
82        // Parse the data in the same format as main Dictionary
83        let joined_details_len: usize = match LittleEndian::read_u32(data).try_into() {
84            Ok(value) => value,
85            Err(_) => return UNK.to_vec(), // return empty vector if conversion fails
86        };
87        let joined_details_bytes: &[u8] =
88            &self.dict.words_data[idx as usize + 4..idx as usize + 4 + joined_details_len];
89
90        let mut details = Vec::new();
91        for bytes in joined_details_bytes.split(|&b| b == 0) {
92            let detail = match str::from_utf8(bytes) {
93                Ok(s) => s,
94                Err(_) => return UNK.to_vec(), // return empty vector if conversion fails
95            };
96            details.push(detail);
97        }
98        details
99    }
100}