chewing/dictionary/
uhash.rs

//! Reads user dictionaries stored in the old text or binary format.
//!
//! The original chewing stores the user dictionary in a text file named
//! `uhash.dat`. The file starts with a decimal integer which stores the
//! lifetime record. Each line after the lifetime record is a user phrase
//! record whose fields are delimited by spaces.
//!
//! Each user phrase record starts with a UTF-8 encoded phrase string, followed
//! by N decimal integers where N is the number of characters in the phrase,
//! followed by 4 decimal integers which are userFreq, recentTime, maxFreq,
//! origFreq respectively.
12
13use std::{
14    ffi::{c_int, c_ushort},
15    io::{self, BufRead, BufReader, Read},
16    mem::size_of,
17    str::{self, FromStr},
18};
19
20use crate::zhuyin::Syllable;
21
22use super::Phrase;
23
/// Size in bytes of one record in the binary format; `try_load_bin` reads the
/// input in chunks of exactly this many bytes.
const BIN_FIELD_SIZE: usize = 125;
/// Signature that a binary format input must start with.
const BIN_HASH_SIG: &str = "CBiH";
26
/// Creates the `io::Error` of kind `InvalidData` that the loaders in this
/// module return for malformed input.
fn invalid_data() -> io::Error {
    io::Error::from(io::ErrorKind::InvalidData)
}
30
31fn try_parse<T: FromStr>(input: Option<&str>) -> io::Result<T> {
32    input
33        .ok_or(invalid_data())?
34        .parse::<T>()
35        .or(Err(invalid_data()))
36}
37
38pub(crate) fn try_load_text<R: Read>(input: R) -> io::Result<Vec<(Vec<Syllable>, Phrase)>> {
39    let reader = BufReader::new(input);
40    let mut lines = reader.lines();
41
42    let first_line = lines.next().ok_or(invalid_data())?;
43    let _lifetime: c_ushort = first_line?.parse().or(Err(invalid_data()))?;
44
45    let mut result = Vec::new();
46    for line in lines {
47        let line = line?;
48        let mut columns = line.split_ascii_whitespace();
49        let phrase_str = columns.next().ok_or(invalid_data())?;
50        let n_chars = phrase_str.chars().count();
51        let mut syllables: Vec<Syllable> = Vec::new();
52        for _ in 0..n_chars {
53            let syl_u16: u16 = try_parse(columns.next())?;
54            syllables.push(syl_u16.try_into().or(Err(invalid_data()))?);
55        }
56        let user_freq = try_parse(columns.next())?;
57        let recent_time = try_parse(columns.next())?;
58        let _max_freq: u32 = try_parse(columns.next())?;
59        let _orig_freq: u32 = try_parse(columns.next())?;
60        result.push((
61            syllables,
62            Phrase::new(phrase_str, user_freq).with_time(recent_time),
63        ));
64    }
65
66    Ok(result)
67}
68
69pub(crate) fn try_load_bin<R: Read>(mut input: R) -> io::Result<Vec<(Vec<Syllable>, Phrase)>> {
70    let mut buf = [0_u8; BIN_FIELD_SIZE];
71
72    input.read_exact(&mut buf[0..BIN_HASH_SIG.len()])?;
73    if !buf.starts_with(BIN_HASH_SIG.as_bytes()) {
74        return Err(invalid_data());
75    }
76    // NB: lifetime size is platform dependent
77    input.read_exact(&mut buf[0..size_of::<c_int>()])?;
78
79    let mut result = Vec::new();
80    loop {
81        if input.read_exact(&mut buf).is_err() {
82            break;
83        }
84
85        // NB: other integers are also platform dependent
86        let user_freq: i32 = i32::from_ne_bytes(buf[0..4].try_into().unwrap());
87        let recent_time: i32 = i32::from_ne_bytes(buf[4..8].try_into().unwrap());
88        let _max_freq: i32 = i32::from_ne_bytes(buf[8..12].try_into().unwrap());
89        let _orig_freq: i32 = i32::from_ne_bytes(buf[12..16].try_into().unwrap());
90
91        // Due to a bug in 0.3.5, some userphrase has negative frequency value.
92        // In this case, we just skip this record.
93        //
94        // See https://github.com/chewing/libchewing/issues/75
95        if user_freq < 0 || recent_time < 0 || _max_freq < 0 || _orig_freq < 0 {
96            continue;
97        }
98
99        let len = buf[16] as usize;
100        // addressing the start of phrase_str
101        if buf[17 + (2 * len) + 1] == 0 {
102            // This record is removed
103            continue;
104        }
105        let mut syllables: Vec<Syllable> = Vec::new();
106        let mut base = 17;
107        for _ in 0..len {
108            let syl_u16 = u16::from_ne_bytes(buf[base..base + 2].try_into().unwrap());
109            syllables.push(syl_u16.try_into().or(Err(invalid_data()))?);
110            base += 2;
111        }
112        let bytes = buf[base] as usize;
113        let phrase_str = str::from_utf8(&buf[base + 1..base + bytes + 1]);
114        if phrase_str.is_err() {
115            continue;
116        }
117
118        result.push((
119            syllables,
120            Phrase::new(phrase_str.unwrap(), user_freq as u32).with_time(recent_time as u64),
121        ));
122    }
123    Ok(result)
124}
125
#[cfg(test)]
mod tests {
    use std::{ffi::c_int, mem::size_of};

    use crate::zhuyin::Syllable;

    use super::{BIN_FIELD_SIZE, Phrase, try_load_bin, try_load_text};

    /// The single record that both valid fixtures are expected to decode to.
    fn expected_records() -> Vec<(Vec<Syllable>, Phrase)> {
        vec![(
            vec![Syllable::try_from(1).unwrap()],
            Phrase::from(("P", 1, 2)),
        )]
    }

    /// A well-formed text dictionary with one record loads successfully.
    #[test]
    fn load_valid_text() {
        let input: &[u8] = b"42\nP 1 1 2 3 4\n";
        let phrases = try_load_text(input).unwrap();
        assert_eq!(expected_records(), phrases);
    }

    /// A record with fewer columns than its phrase requires is rejected.
    #[test]
    fn load_truncated_text() {
        let input: &[u8] = b"42\nPhrase 1 2 3 4 5 6\n";
        assert!(try_load_text(input).is_err());
    }

    /// Input whose first line is not a number is rejected.
    #[test]
    fn load_malformed_text() {
        let input: &[u8] = br#"<?xml version="1.0" encoding="UTF-8"?>\n"#;
        assert!(try_load_text(input).is_err());
    }

    /// The text loader rejects the header of a binary dictionary.
    #[test]
    fn load_binary_as_text() {
        let input: &[u8] = b"CBiH\0\0\0\0";
        assert!(try_load_text(input).is_err());
    }

    /// A well-formed binary dictionary with one record loads successfully.
    #[test]
    fn load_valid_bin() {
        let header_len = b"CBiH".len() + size_of::<c_int>();
        let mut input = Vec::with_capacity(header_len + BIN_FIELD_SIZE);

        // Header: signature followed by the lifetime.
        input.extend_from_slice(b"CBiH");
        input.extend_from_slice(&(0 as c_int).to_ne_bytes());

        // Record: four integers, syllable count, one syllable, phrase length, phrase.
        for field in [1_i32, 2, 3, 4] {
            input.extend_from_slice(&field.to_ne_bytes());
        }
        input.push(1);
        input.extend_from_slice(&1_u16.to_ne_bytes());
        input.push(1);
        input.extend_from_slice(b"P");

        // Zero-pad the record up to its fixed size.
        input.resize(header_len + BIN_FIELD_SIZE, 0);

        let phrases = try_load_bin(&input[..]).unwrap();
        assert_eq!(expected_records(), phrases);
    }
}
193}