lindera_dictionary/
dictionary.rs1pub mod character_definition;
2pub mod connection_cost_matrix;
3pub mod prefix_dictionary;
4pub mod unknown_dictionary;
5
6use std::str;
7
8use byteorder::{ByteOrder, LittleEndian};
9use once_cell::sync::Lazy;
10use serde::{Deserialize, Serialize};
11
12use crate::LinderaResult;
13use crate::dictionary::character_definition::CharacterDefinition;
14use crate::dictionary::connection_cost_matrix::ConnectionCostMatrix;
15use crate::dictionary::prefix_dictionary::PrefixDictionary;
16use crate::dictionary::unknown_dictionary::UnknownDictionary;
17use crate::error::LinderaErrorKind;
18
19pub static UNK: Lazy<Vec<&str>> = Lazy::new(|| vec!["UNK"]);
20
21#[derive(Clone)]
22pub struct Dictionary {
23 pub prefix_dictionary: PrefixDictionary,
24 pub connection_cost_matrix: ConnectionCostMatrix,
25 pub character_definition: CharacterDefinition,
26 pub unknown_dictionary: UnknownDictionary,
27}
28
29impl Dictionary {
30 pub fn word_details(&self, word_id: usize) -> Vec<&str> {
31 if 4 * word_id >= self.prefix_dictionary.words_idx_data.len() {
32 return vec![];
33 }
34
35 let idx: usize = match LittleEndian::read_u32(
36 &self.prefix_dictionary.words_idx_data[4 * word_id..][..4],
37 )
38 .try_into()
39 {
40 Ok(value) => value,
41 Err(_) => return UNK.to_vec(), };
43 let data = &self.prefix_dictionary.words_data[idx..];
44 let joined_details_len: usize = match LittleEndian::read_u32(data).try_into() {
45 Ok(value) => value,
46 Err(_) => return UNK.to_vec(), };
48 let joined_details_bytes: &[u8] =
49 &self.prefix_dictionary.words_data[idx + 4..idx + 4 + joined_details_len];
50
51 let mut details = Vec::new();
52 for bytes in joined_details_bytes.split(|&b| b == 0) {
53 let detail = match str::from_utf8(bytes) {
54 Ok(s) => s,
55 Err(_) => return UNK.to_vec(), };
57 details.push(detail);
58 }
59 details
60 }
61}
62
63#[derive(Clone, Serialize, Deserialize)]
64pub struct UserDictionary {
65 pub dict: PrefixDictionary,
66}
67
68impl UserDictionary {
69 pub fn load(user_dict_data: &[u8]) -> LinderaResult<UserDictionary> {
70 bincode::serde::decode_from_slice(user_dict_data, bincode::config::legacy())
71 .map(|(result, _len)| result)
72 .map_err(|err| LinderaErrorKind::Deserialize.with_error(anyhow::anyhow!(err)))
73 }
74
75 pub fn word_details(&self, word_id: usize) -> Vec<&str> {
76 if 4 * word_id >= self.dict.words_idx_data.len() {
77 return UNK.to_vec(); }
79 let idx = LittleEndian::read_u32(&self.dict.words_idx_data[4 * word_id..][..4]);
80 let data = &self.dict.words_data[idx as usize..];
81
82 let joined_details_len: usize = match LittleEndian::read_u32(data).try_into() {
84 Ok(value) => value,
85 Err(_) => return UNK.to_vec(), };
87 let joined_details_bytes: &[u8] =
88 &self.dict.words_data[idx as usize + 4..idx as usize + 4 + joined_details_len];
89
90 let mut details = Vec::new();
91 for bytes in joined_details_bytes.split(|&b| b == 0) {
92 let detail = match str::from_utf8(bytes) {
93 Ok(s) => s,
94 Err(_) => return UNK.to_vec(), };
96 details.push(detail);
97 }
98 details
99 }
100}