chewing/dictionary/
uhash.rs1use std::{
14 ffi::{c_int, c_ushort},
15 io::{self, BufRead, BufReader, Read},
16 mem::size_of,
17 str::{self, FromStr},
18};
19
20use crate::zhuyin::Syllable;
21
22use super::Phrase;
23
/// Size in bytes of one fixed-width record in the binary format; the binary
/// loader reads its input in chunks of exactly this many bytes.
const BIN_FIELD_SIZE: usize = 125;
/// Signature that must appear at the very start of binary input.
const BIN_HASH_SIG: &str = "CBiH";
26
/// Builds an [`io::Error`] of kind [`io::ErrorKind::InvalidData`].
///
/// The error carries no message or source, only the kind.
fn invalid_data() -> io::Error {
    io::Error::from(io::ErrorKind::InvalidData)
}
30
31fn try_parse<T: FromStr>(input: Option<&str>) -> io::Result<T> {
32 input
33 .ok_or(invalid_data())?
34 .parse::<T>()
35 .or(Err(invalid_data()))
36}
37
38pub(crate) fn try_load_text<R: Read>(input: R) -> io::Result<Vec<(Vec<Syllable>, Phrase)>> {
39 let reader = BufReader::new(input);
40 let mut lines = reader.lines();
41
42 let first_line = lines.next().ok_or(invalid_data())?;
43 let _lifetime: c_ushort = first_line?.parse().or(Err(invalid_data()))?;
44
45 let mut result = Vec::new();
46 for line in lines {
47 let line = line?;
48 let mut columns = line.split_ascii_whitespace();
49 let phrase_str = columns.next().ok_or(invalid_data())?;
50 let n_chars = phrase_str.chars().count();
51 let mut syllables: Vec<Syllable> = Vec::new();
52 for _ in 0..n_chars {
53 let syl_u16: u16 = try_parse(columns.next())?;
54 syllables.push(syl_u16.try_into().or(Err(invalid_data()))?);
55 }
56 let user_freq = try_parse(columns.next())?;
57 let recent_time = try_parse(columns.next())?;
58 let _max_freq: u32 = try_parse(columns.next())?;
59 let _orig_freq: u32 = try_parse(columns.next())?;
60 result.push((
61 syllables,
62 Phrase::new(phrase_str, user_freq).with_time(recent_time),
63 ));
64 }
65
66 Ok(result)
67}
68
69pub(crate) fn try_load_bin<R: Read>(mut input: R) -> io::Result<Vec<(Vec<Syllable>, Phrase)>> {
70 let mut buf = [0_u8; BIN_FIELD_SIZE];
71
72 input.read_exact(&mut buf[0..BIN_HASH_SIG.len()])?;
73 if !buf.starts_with(BIN_HASH_SIG.as_bytes()) {
74 return Err(invalid_data());
75 }
76 input.read_exact(&mut buf[0..size_of::<c_int>()])?;
78
79 let mut result = Vec::new();
80 loop {
81 if input.read_exact(&mut buf).is_err() {
82 break;
83 }
84
85 let user_freq: i32 = i32::from_ne_bytes(buf[0..4].try_into().unwrap());
87 let recent_time: i32 = i32::from_ne_bytes(buf[4..8].try_into().unwrap());
88 let _max_freq: i32 = i32::from_ne_bytes(buf[8..12].try_into().unwrap());
89 let _orig_freq: i32 = i32::from_ne_bytes(buf[12..16].try_into().unwrap());
90
91 if user_freq < 0 || recent_time < 0 || _max_freq < 0 || _orig_freq < 0 {
96 continue;
97 }
98
99 let len = buf[16] as usize;
100 if buf[17 + (2 * len) + 1] == 0 {
102 continue;
104 }
105 let mut syllables: Vec<Syllable> = Vec::new();
106 let mut base = 17;
107 for _ in 0..len {
108 let syl_u16 = u16::from_ne_bytes(buf[base..base + 2].try_into().unwrap());
109 syllables.push(syl_u16.try_into().or(Err(invalid_data()))?);
110 base += 2;
111 }
112 let bytes = buf[base] as usize;
113 let phrase_str = str::from_utf8(&buf[base + 1..base + bytes + 1]);
114 if phrase_str.is_err() {
115 continue;
116 }
117
118 result.push((
119 syllables,
120 Phrase::new(phrase_str.unwrap(), user_freq as u32).with_time(recent_time as u64),
121 ));
122 }
123 Ok(result)
124}
125
#[cfg(test)]
mod tests {
    use std::{ffi::c_int, mem::size_of};

    use crate::zhuyin::Syllable;

    use super::{BIN_FIELD_SIZE, Phrase, try_load_bin, try_load_text};

    /// The single entry both valid fixtures decode to: phrase "P" with one
    /// syllable of value 1, built from the tuple `("P", 1, 2)`.
    fn expected_entries() -> Vec<(Vec<Syllable>, Phrase)> {
        let syllables = vec![Syllable::try_from(1).unwrap()];
        let phrase = Phrase::from(("P", 1, 2));
        vec![(syllables, phrase)]
    }

    /// A well-formed text file with a header and one record loads fully.
    #[test]
    fn load_valid_text() {
        let input: &[u8] = b"42\nP 1 1 2 3 4\n";
        let phrases = try_load_text(input).unwrap();
        assert_eq!(expected_entries(), phrases);
    }

    /// A record with fewer columns than its phrase length requires is rejected.
    #[test]
    fn load_truncated_text() {
        let input: &[u8] = b"42\nPhrase 1 2 3 4 5 6\n";
        assert!(try_load_text(input).is_err());
    }

    /// Input whose first line is not a number is rejected.
    #[test]
    fn load_malformed_text() {
        let input: &[u8] = br#"<?xml version="1.0" encoding="UTF-8"?>\n"#;
        assert!(try_load_text(input).is_err());
    }

    /// Feeding a binary header to the text loader is rejected.
    #[test]
    fn load_binary_as_text() {
        let input: &[u8] = b"CBiH\0\0\0\0";
        assert!(try_load_text(input).is_err());
    }

    /// A binary file with signature, header integer and one record loads fully.
    #[test]
    fn load_valid_bin() {
        let mut input = b"CBiH".to_vec();
        input.extend_from_slice(&(0 as c_int).to_ne_bytes());
        // The four i32 counters at the start of the record.
        for counter in [1_i32, 2, 3, 4] {
            input.extend_from_slice(&counter.to_ne_bytes());
        }
        // One syllable with value 1.
        input.push(1);
        input.extend_from_slice(&1_u16.to_ne_bytes());
        // One phrase byte: "P".
        input.push(1);
        input.push(b'P');
        // Zero-pad so the record occupies exactly BIN_FIELD_SIZE bytes.
        input.resize(4 + size_of::<c_int>() + BIN_FIELD_SIZE, 0);

        let phrases = try_load_bin(input.as_slice()).unwrap();
        assert_eq!(expected_entries(), phrases);
    }
}
193}