1use crate::traits::{DictParser, ValidationReport};
2use dictx_core::{DictEntry, DictxError, Result};
3use serde::{Deserialize, Serialize};
4use std::collections::VecDeque;
5use std::fs::File;
6use std::io::{BufReader, Cursor, Read, Seek, SeekFrom, Write};
7use std::path::Path;
8
/// Magic bytes every DXDICT pack must start with.
const MAGIC: &[u8; 8] = b"DXDICT01";
/// The only pack format version this module writes and accepts.
const VERSION: u32 = 1;
/// Byte offset of the little-endian `u64` entry count: it follows the 8-byte
/// magic and the 4-byte version, and `write_dxdict` seeks back here to patch it.
const COUNT_OFFSET: u64 = 12;
12
/// Pseudo-path that `builtin_packs` maps to the embedded "new-century" packs.
pub const BUILTIN_NEW_CENTURY_SOURCE: &str = "builtin:new-century-han-eng";
/// Pseudo-path that `builtin_packs` maps to the embedded "kd-data" packs.
pub const BUILTIN_KD_DATA_SOURCE: &str = "builtin:kd-data";
15
/// Embedded pack images served for `BUILTIN_NEW_CENTURY_SOURCE`, listed in the
/// order in which they are read.
const BUILTIN_NEW_CENTURY_PACKS: &[&[u8]] = &[
    dictx_data_new_century_1::BYTES,
    dictx_data_new_century_2::BYTES,
];
/// Embedded pack images served for `BUILTIN_KD_DATA_SOURCE`, listed in the
/// order in which they are read.
const BUILTIN_KD_DATA_PACKS: &[&[u8]] = &[
    dictx_data_kd_1::BYTES,
    dictx_data_kd_2::BYTES,
    dictx_data_kd_3::BYTES,
    dictx_data_kd_4::BYTES,
];
26
/// Descriptive metadata stored as JSON in the header of a DXDICT pack.
///
/// `write_dxdict` serializes it with `serde_json`; `read_header` parses it
/// back only as a well-formedness check and then discards the value.
#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq, Eq)]
pub struct DxdictMetadata {
    /// Name of the dictionary.
    pub name: String,
    /// Optional display label (presumably human-readable; not interpreted in this file).
    pub display: Option<String>,
    /// Optional tag for the format the pack was produced from
    /// (assumed from the field name; not interpreted in this file).
    pub source_format: Option<String>,
}
33
34pub struct DxdictParser;
35
36impl DxdictParser {
37 pub fn parse_bytes(
38 bytes: &'static [u8],
39 ) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
40 let cursor = Cursor::new(bytes);
41 let iter = DxdictIter::new(cursor)?;
42 Ok(Box::new(iter))
43 }
44
45 pub fn parse_packs(
46 packs: &'static [&'static [u8]],
47 ) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
48 let mut iters = VecDeque::new();
49 for bytes in packs {
50 iters.push_back(DxdictIter::new(Cursor::new(*bytes))?);
51 }
52 Ok(Box::new(MultiDxdictIter { iters }))
53 }
54}
55
56pub struct BuiltinDxdictParser;
57
58impl DictParser for BuiltinDxdictParser {
59 fn name(&self) -> &'static str {
60 "DictX Built-in Dictionary Pack"
61 }
62
63 fn format_id(&self) -> &'static str {
64 "builtin-dxdict"
65 }
66
67 fn validate(&self, path: &Path) -> Result<ValidationReport> {
68 let mut entry_count = 0usize;
69 for bytes in builtin_packs(path)? {
70 let (_, header) = read_header(Cursor::new(*bytes))?;
71 entry_count += header.entry_count as usize;
72 }
73 Ok(ValidationReport::ok(self.format_id(), Some(entry_count)))
74 }
75
76 fn parse(&self, path: &Path) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
77 DxdictParser::parse_packs(builtin_packs(path)?)
78 }
79}
80
81impl DictParser for DxdictParser {
82 fn name(&self) -> &'static str {
83 "DictX Dictionary Pack"
84 }
85
86 fn format_id(&self) -> &'static str {
87 "dxdict"
88 }
89
90 fn validate(&self, path: &Path) -> Result<ValidationReport> {
91 let file = File::open(path)?;
92 let (_, header) = read_header(BufReader::new(file))?;
93 Ok(ValidationReport::ok(
94 self.format_id(),
95 Some(header.entry_count as usize),
96 ))
97 }
98
99 fn parse(&self, path: &Path) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
100 let file = File::open(path)?;
101 let iter = DxdictIter::new(BufReader::new(file))?;
102 Ok(Box::new(iter))
103 }
104}
105
106pub fn write_dxdict<I>(path: &Path, metadata: &DxdictMetadata, entries: I) -> Result<usize>
107where
108 I: IntoIterator<Item = Result<DictEntry>>,
109{
110 let mut file = File::create(path)?;
111 file.write_all(MAGIC)?;
112 file.write_all(&VERSION.to_le_bytes())?;
113 file.write_all(&0u64.to_le_bytes())?;
114
115 let metadata_bytes = serde_json::to_vec(metadata)?;
116 file.write_all(&(metadata_bytes.len() as u32).to_le_bytes())?;
117 file.write_all(&metadata_bytes)?;
118
119 let mut count = 0u64;
120 for entry in entries {
121 let entry = entry?;
122 let bytes = rmp_serde::to_vec_named(&entry)
123 .map_err(|err| DictxError::InvalidData(format!("DXDICT 序列化失败: {err}")))?;
124 file.write_all(&(bytes.len() as u32).to_le_bytes())?;
125 file.write_all(&bytes)?;
126 count += 1;
127 }
128
129 file.seek(SeekFrom::Start(COUNT_OFFSET))?;
130 file.write_all(&count.to_le_bytes())?;
131 file.flush()?;
132 Ok(count as usize)
133}
134
135fn builtin_packs(path: &Path) -> Result<&'static [&'static [u8]]> {
136 let source = path.to_string_lossy();
137 match source.as_ref() {
138 BUILTIN_NEW_CENTURY_SOURCE => Ok(BUILTIN_NEW_CENTURY_PACKS),
139 BUILTIN_KD_DATA_SOURCE => Ok(BUILTIN_KD_DATA_PACKS),
140 other => Err(DictxError::InvalidData(format!("未知内置词库: {other}"))),
141 }
142}
143
/// The part of a pack header that callers need after `read_header` has
/// validated it.
#[derive(Debug)]
struct DxdictHeader {
    /// Number of entries the header declares (read as a little-endian `u64`).
    entry_count: u64,
}
148
/// Streaming reader over the entry section of a single pack.
struct DxdictIter<R: Read> {
    /// Source of the entry frames; `new` has already consumed the header from it.
    reader: R,
    /// How many more entries `next` will still attempt to read; starts at the
    /// entry count declared in the header.
    remaining: u64,
}
153
154struct MultiDxdictIter {
155 iters: VecDeque<DxdictIter<Cursor<&'static [u8]>>>,
156}
157
158impl Iterator for MultiDxdictIter {
159 type Item = Result<DictEntry>;
160
161 fn next(&mut self) -> Option<Self::Item> {
162 loop {
163 let iter = self.iters.front_mut()?;
164 if let Some(entry) = iter.next() {
165 return Some(entry);
166 }
167 self.iters.pop_front();
168 }
169 }
170}
171
172impl<R: Read> DxdictIter<R> {
173 fn new(reader: R) -> Result<Self> {
174 let (reader, header) = read_header(reader)?;
175 Ok(Self {
176 reader,
177 remaining: header.entry_count,
178 })
179 }
180}
181
182impl<R: Read> Iterator for DxdictIter<R> {
183 type Item = Result<DictEntry>;
184
185 fn next(&mut self) -> Option<Self::Item> {
186 if self.remaining == 0 {
187 return None;
188 }
189 self.remaining -= 1;
190
191 let mut len_buf = [0u8; 4];
192 if let Err(err) = self.reader.read_exact(&mut len_buf) {
193 return Some(Err(err.into()));
194 }
195 let len = u32::from_le_bytes(len_buf) as usize;
196 let mut bytes = vec![0u8; len];
197 if let Err(err) = self.reader.read_exact(&mut bytes) {
198 return Some(Err(err.into()));
199 }
200 Some(
201 rmp_serde::from_slice(&bytes)
202 .map_err(|err| DictxError::InvalidData(format!("DXDICT 反序列化失败: {err}"))),
203 )
204 }
205}
206
207fn read_header<R: Read>(mut reader: R) -> Result<(R, DxdictHeader)> {
208 let mut magic = [0u8; 8];
209 reader.read_exact(&mut magic)?;
210 if &magic != MAGIC {
211 return Err(DictxError::InvalidData("DXDICT magic 不匹配".to_string()));
212 }
213
214 let version = read_u32(&mut reader)?;
215 if version != VERSION {
216 return Err(DictxError::InvalidData(format!(
217 "不支持的 DXDICT 版本: {version}"
218 )));
219 }
220
221 let entry_count = read_u64(&mut reader)?;
222 let metadata_len = read_u32(&mut reader)? as usize;
223 let mut metadata = vec![0u8; metadata_len];
224 reader.read_exact(&mut metadata)?;
225 let _: DxdictMetadata = serde_json::from_slice(&metadata)?;
226
227 Ok((reader, DxdictHeader { entry_count }))
228}
229
230fn read_u32(reader: &mut impl Read) -> Result<u32> {
231 let mut buf = [0u8; 4];
232 reader.read_exact(&mut buf)?;
233 Ok(u32::from_le_bytes(buf))
234}
235
236fn read_u64(reader: &mut impl Read) -> Result<u64> {
237 let mut buf = [0u8; 8];
238 reader.read_exact(&mut buf)?;
239 Ok(u64::from_le_bytes(buf))
240}
241
#[cfg(test)]
mod tests {
    use super::*;
    use dictx_core::{Definition, DictSource};

    // Writes a one-entry pack to a temporary file, then checks that both
    // `validate` and `parse` see that entry again.
    #[test]
    fn roundtrips_dxdict_pack() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("test.dxdict");
        let mut entry = DictEntry::new(DictSource::Custom { name: "t".into() }, "apple");
        entry
            .definitions
            .push(Definition::new("fruit", "苹果", Some("n".into())));

        let count = write_dxdict(
            &path,
            &DxdictMetadata {
                name: "test".into(),
                display: Some("Test".into()),
                source_format: Some("unit".into()),
            },
            vec![Ok(entry)],
        )
        .unwrap();
        assert_eq!(count, 1);

        // The header alone must already report the single entry.
        let parser = DxdictParser;
        let report = parser.validate(&path).unwrap();
        assert!(report.valid);
        assert_eq!(report.estimated_entries, Some(1));

        // Reading the pack back must reproduce the word and its definition.
        let entries = parser
            .parse(&path)
            .unwrap()
            .collect::<Result<Vec<_>>>()
            .unwrap();
        assert_eq!(entries[0].word, "apple");
        assert_eq!(entries[0].definitions[0].zh, "苹果");
    }

    // The embedded "new-century" packs must validate, declare more than
    // 80000 entries in total, and yield a first entry with a non-empty word.
    #[test]
    fn validates_builtin_new_century_pack() {
        let parser = BuiltinDxdictParser;
        let path = Path::new(BUILTIN_NEW_CENTURY_SOURCE);

        let report = parser.validate(path).unwrap();
        assert!(report.valid);
        assert!(report.estimated_entries.unwrap_or_default() > 80000);

        let mut entries = parser.parse(path).unwrap();
        let first = entries.next().unwrap().unwrap();
        assert!(!first.word.is_empty());
    }

    // The embedded "kd-data" packs must validate and declare more than
    // 80000 entries in total.
    #[test]
    fn validates_builtin_kd_pack() {
        let parser = BuiltinDxdictParser;
        let path = Path::new(BUILTIN_KD_DATA_SOURCE);

        let report = parser.validate(path).unwrap();
        assert!(report.valid);
        assert!(report.estimated_entries.unwrap_or_default() > 80000);
    }
}