Skip to main content

dictx_parser/
dxdict.rs

1use crate::traits::{DictParser, ValidationReport};
2use dictx_core::{DictEntry, DictxError, Result};
3use serde::{Deserialize, Serialize};
4use std::collections::VecDeque;
5use std::fs::File;
6use std::io::{BufReader, Cursor, Read, Seek, SeekFrom, Write};
7use std::path::Path;
8
/// Eight-byte signature written at the start of every DXDICT pack and
/// checked by `read_header`.
const MAGIC: &[u8; 8] = b"DXDICT01";
/// Format version written by `write_dxdict`; `read_header` rejects any other value.
const VERSION: u32 = 1;
/// Byte offset of the little-endian `u64` entry count. It sits right after the
/// 8-byte magic and the 4-byte version; `write_dxdict` first writes a zero
/// placeholder there and seeks back to patch in the real count.
const COUNT_OFFSET: u64 = 12;
12
/// Pseudo-path that `builtin_packs` maps to `BUILTIN_NEW_CENTURY_PACKS`.
pub const BUILTIN_NEW_CENTURY_SOURCE: &str = "builtin:new-century-han-eng";
/// Pseudo-path that `builtin_packs` maps to `BUILTIN_KD_DATA_PACKS`.
pub const BUILTIN_KD_DATA_SOURCE: &str = "builtin:kd-data";
15
/// Byte blobs exported by the `dictx_data_new_century_*` crates. Each blob is
/// read as an independent DXDICT pack (own header), in the order listed.
const BUILTIN_NEW_CENTURY_PACKS: &[&[u8]] = &[
    dictx_data_new_century_1::BYTES,
    dictx_data_new_century_2::BYTES,
];
/// Byte blobs exported by the `dictx_data_kd_*` crates. Each blob is read as
/// an independent DXDICT pack (own header), in the order listed.
const BUILTIN_KD_DATA_PACKS: &[&[u8]] = &[
    dictx_data_kd_1::BYTES,
    dictx_data_kd_2::BYTES,
    dictx_data_kd_3::BYTES,
    dictx_data_kd_4::BYTES,
];
26
/// Descriptive metadata stored as a JSON blob in the pack header.
///
/// `write_dxdict` serialises it with `serde_json`; `read_header` deserialises
/// it only to check that it is well-formed and then discards the value.
#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq, Eq)]
pub struct DxdictMetadata {
    /// Name of the dictionary (presumably a stable identifier — confirm with callers).
    pub name: String,
    /// Optional display label (presumably the human-readable title — confirm with callers).
    pub display: Option<String>,
    /// Optional tag for the format the pack was converted from (assumption — confirm with callers).
    pub source_format: Option<String>,
}
33
34pub struct DxdictParser;
35
36impl DxdictParser {
37    pub fn parse_bytes(
38        bytes: &'static [u8],
39    ) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
40        let cursor = Cursor::new(bytes);
41        let iter = DxdictIter::new(cursor)?;
42        Ok(Box::new(iter))
43    }
44
45    pub fn parse_packs(
46        packs: &'static [&'static [u8]],
47    ) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
48        let mut iters = VecDeque::new();
49        for bytes in packs {
50            iters.push_back(DxdictIter::new(Cursor::new(*bytes))?);
51        }
52        Ok(Box::new(MultiDxdictIter { iters }))
53    }
54}
55
56pub struct BuiltinDxdictParser;
57
58impl DictParser for BuiltinDxdictParser {
59    fn name(&self) -> &'static str {
60        "DictX Built-in Dictionary Pack"
61    }
62
63    fn format_id(&self) -> &'static str {
64        "builtin-dxdict"
65    }
66
67    fn validate(&self, path: &Path) -> Result<ValidationReport> {
68        let mut entry_count = 0usize;
69        for bytes in builtin_packs(path)? {
70            let (_, header) = read_header(Cursor::new(*bytes))?;
71            entry_count += header.entry_count as usize;
72        }
73        Ok(ValidationReport::ok(self.format_id(), Some(entry_count)))
74    }
75
76    fn parse(&self, path: &Path) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
77        DxdictParser::parse_packs(builtin_packs(path)?)
78    }
79}
80
81impl DictParser for DxdictParser {
82    fn name(&self) -> &'static str {
83        "DictX Dictionary Pack"
84    }
85
86    fn format_id(&self) -> &'static str {
87        "dxdict"
88    }
89
90    fn validate(&self, path: &Path) -> Result<ValidationReport> {
91        let file = File::open(path)?;
92        let (_, header) = read_header(BufReader::new(file))?;
93        Ok(ValidationReport::ok(
94            self.format_id(),
95            Some(header.entry_count as usize),
96        ))
97    }
98
99    fn parse(&self, path: &Path) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
100        let file = File::open(path)?;
101        let iter = DxdictIter::new(BufReader::new(file))?;
102        Ok(Box::new(iter))
103    }
104}
105
106pub fn write_dxdict<I>(path: &Path, metadata: &DxdictMetadata, entries: I) -> Result<usize>
107where
108    I: IntoIterator<Item = Result<DictEntry>>,
109{
110    let mut file = File::create(path)?;
111    file.write_all(MAGIC)?;
112    file.write_all(&VERSION.to_le_bytes())?;
113    file.write_all(&0u64.to_le_bytes())?;
114
115    let metadata_bytes = serde_json::to_vec(metadata)?;
116    file.write_all(&(metadata_bytes.len() as u32).to_le_bytes())?;
117    file.write_all(&metadata_bytes)?;
118
119    let mut count = 0u64;
120    for entry in entries {
121        let entry = entry?;
122        let bytes = rmp_serde::to_vec_named(&entry)
123            .map_err(|err| DictxError::InvalidData(format!("DXDICT 序列化失败: {err}")))?;
124        file.write_all(&(bytes.len() as u32).to_le_bytes())?;
125        file.write_all(&bytes)?;
126        count += 1;
127    }
128
129    file.seek(SeekFrom::Start(COUNT_OFFSET))?;
130    file.write_all(&count.to_le_bytes())?;
131    file.flush()?;
132    Ok(count as usize)
133}
134
135fn builtin_packs(path: &Path) -> Result<&'static [&'static [u8]]> {
136    let source = path.to_string_lossy();
137    match source.as_ref() {
138        BUILTIN_NEW_CENTURY_SOURCE => Ok(BUILTIN_NEW_CENTURY_PACKS),
139        BUILTIN_KD_DATA_SOURCE => Ok(BUILTIN_KD_DATA_PACKS),
140        other => Err(DictxError::InvalidData(format!("未知内置词库: {other}"))),
141    }
142}
143
/// The part of a pack header that callers need after `read_header` has
/// validated the magic, version and metadata.
#[derive(Debug)]
struct DxdictHeader {
    /// Number of entries the header declares for this pack.
    entry_count: u64,
}
148
/// Streaming reader over the entries of a single pack.
struct DxdictIter<R: Read> {
    /// Source positioned just past the header, at the first entry's length prefix.
    reader: R,
    /// Number of entries still to be read; initialised from the header's entry count.
    remaining: u64,
}
153
154struct MultiDxdictIter {
155    iters: VecDeque<DxdictIter<Cursor<&'static [u8]>>>,
156}
157
158impl Iterator for MultiDxdictIter {
159    type Item = Result<DictEntry>;
160
161    fn next(&mut self) -> Option<Self::Item> {
162        loop {
163            let iter = self.iters.front_mut()?;
164            if let Some(entry) = iter.next() {
165                return Some(entry);
166            }
167            self.iters.pop_front();
168        }
169    }
170}
171
172impl<R: Read> DxdictIter<R> {
173    fn new(reader: R) -> Result<Self> {
174        let (reader, header) = read_header(reader)?;
175        Ok(Self {
176            reader,
177            remaining: header.entry_count,
178        })
179    }
180}
181
182impl<R: Read> Iterator for DxdictIter<R> {
183    type Item = Result<DictEntry>;
184
185    fn next(&mut self) -> Option<Self::Item> {
186        if self.remaining == 0 {
187            return None;
188        }
189        self.remaining -= 1;
190
191        let mut len_buf = [0u8; 4];
192        if let Err(err) = self.reader.read_exact(&mut len_buf) {
193            return Some(Err(err.into()));
194        }
195        let len = u32::from_le_bytes(len_buf) as usize;
196        let mut bytes = vec![0u8; len];
197        if let Err(err) = self.reader.read_exact(&mut bytes) {
198            return Some(Err(err.into()));
199        }
200        Some(
201            rmp_serde::from_slice(&bytes)
202                .map_err(|err| DictxError::InvalidData(format!("DXDICT 反序列化失败: {err}"))),
203        )
204    }
205}
206
207fn read_header<R: Read>(mut reader: R) -> Result<(R, DxdictHeader)> {
208    let mut magic = [0u8; 8];
209    reader.read_exact(&mut magic)?;
210    if &magic != MAGIC {
211        return Err(DictxError::InvalidData("DXDICT magic 不匹配".to_string()));
212    }
213
214    let version = read_u32(&mut reader)?;
215    if version != VERSION {
216        return Err(DictxError::InvalidData(format!(
217            "不支持的 DXDICT 版本: {version}"
218        )));
219    }
220
221    let entry_count = read_u64(&mut reader)?;
222    let metadata_len = read_u32(&mut reader)? as usize;
223    let mut metadata = vec![0u8; metadata_len];
224    reader.read_exact(&mut metadata)?;
225    let _: DxdictMetadata = serde_json::from_slice(&metadata)?;
226
227    Ok((reader, DxdictHeader { entry_count }))
228}
229
230fn read_u32(reader: &mut impl Read) -> Result<u32> {
231    let mut buf = [0u8; 4];
232    reader.read_exact(&mut buf)?;
233    Ok(u32::from_le_bytes(buf))
234}
235
236fn read_u64(reader: &mut impl Read) -> Result<u64> {
237    let mut buf = [0u8; 8];
238    reader.read_exact(&mut buf)?;
239    Ok(u64::from_le_bytes(buf))
240}
241
#[cfg(test)]
mod tests {
    use super::*;
    use dictx_core::{Definition, DictSource};

    /// Writes a one-entry pack to a temporary file, then checks that both
    /// `validate` and `parse` see exactly that entry.
    #[test]
    fn roundtrips_dxdict_pack() {
        let tmp = tempfile::tempdir().unwrap();
        let pack_path = tmp.path().join("test.dxdict");

        let mut apple = DictEntry::new(DictSource::Custom { name: "t".into() }, "apple");
        apple
            .definitions
            .push(Definition::new("fruit", "苹果", Some("n".into())));

        let metadata = DxdictMetadata {
            name: "test".into(),
            display: Some("Test".into()),
            source_format: Some("unit".into()),
        };
        let written = write_dxdict(&pack_path, &metadata, vec![Ok(apple)]).unwrap();
        assert_eq!(written, 1);

        let parser = DxdictParser;

        let report = parser.validate(&pack_path).unwrap();
        assert!(report.valid);
        assert_eq!(report.estimated_entries, Some(1));

        let parsed = parser
            .parse(&pack_path)
            .unwrap()
            .collect::<Result<Vec<_>>>()
            .unwrap();
        let first = &parsed[0];
        assert_eq!(first.word, "apple");
        assert_eq!(first.definitions[0].zh, "苹果");
    }

    /// The embedded New Century packs validate, declare more than 80000
    /// entries, and the first parsed entry has a non-empty headword.
    #[test]
    fn validates_builtin_new_century_pack() {
        let parser = BuiltinDxdictParser;
        let source = Path::new(BUILTIN_NEW_CENTURY_SOURCE);

        let report = parser.validate(source).unwrap();
        assert!(report.valid);
        assert!(report.estimated_entries.unwrap_or_default() > 80000);

        let mut stream = parser.parse(source).unwrap();
        let head = stream.next().unwrap().unwrap();
        assert!(!head.word.is_empty());
    }

    /// The embedded KD packs validate and declare more than 80000 entries.
    #[test]
    fn validates_builtin_kd_pack() {
        let parser = BuiltinDxdictParser;
        let source = Path::new(BUILTIN_KD_DATA_SOURCE);

        let report = parser.validate(source).unwrap();
        assert!(report.valid);
        assert!(report.estimated_entries.unwrap_or_default() > 80000);
    }
}