use crate::traits::{DictParser, ValidationReport};
use dictx_core::{DictEntry, DictxError, Result};
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
use std::fs::File;
use std::io::{BufReader, Cursor, Read, Seek, SeekFrom, Write};
use std::path::Path;
/// Eight-byte signature written at the very start of a pack by `write_dxdict`
/// and checked by `read_header`.
const MAGIC: &[u8; 8] = b"DXDICT01";
/// Format version, stored as a little-endian `u32` right after the magic.
/// `read_header` rejects any other value.
const VERSION: u32 = 1;
/// Byte offset of the `u64` entry count: 8 bytes of magic plus 4 bytes of version.
/// `write_dxdict` seeks back here to replace the placeholder count.
const COUNT_OFFSET: u64 = 12;
/// Pseudo-path that `builtin_packs` maps to `BUILTIN_NEW_CENTURY_PACKS`.
pub const BUILTIN_NEW_CENTURY_SOURCE: &str = "builtin:new-century-han-eng";
/// Pseudo-path that `builtin_packs` maps to `BUILTIN_KD_DATA_PACKS`.
pub const BUILTIN_KD_DATA_SOURCE: &str = "builtin:kd-data";
/// Pack images exposed as `BYTES` by the `dictx_data_new_century_*` crates,
/// listed in the order they are iterated.
const BUILTIN_NEW_CENTURY_PACKS: &[&[u8]] = &[
dictx_data_new_century_1::BYTES,
dictx_data_new_century_2::BYTES,
];
/// Pack images exposed as `BYTES` by the `dictx_data_kd_*` crates,
/// listed in the order they are iterated.
const BUILTIN_KD_DATA_PACKS: &[&[u8]] = &[
dictx_data_kd_1::BYTES,
dictx_data_kd_2::BYTES,
dictx_data_kd_3::BYTES,
dictx_data_kd_4::BYTES,
];
/// Descriptive metadata stored in a pack header.
///
/// `write_dxdict` serializes it as JSON behind a `u32` length prefix, and
/// `read_header` parses it back (discarding the value after validation).
#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq, Eq)]
pub struct DxdictMetadata {
/// Name of the dictionary. NOTE(review): presumably a stable identifier — confirm with callers.
pub name: String,
/// Optional display string. NOTE(review): presumably a human-readable title — confirm with callers.
pub display: Option<String>,
/// Optional label for the format the pack was converted from
/// (NOTE(review): inferred from the field name — confirm with callers).
pub source_format: Option<String>,
}
/// Parser for DXDICT packs, either on disk (via its `DictParser` impl) or
/// embedded in the binary as `'static` byte slices.
pub struct DxdictParser;

impl DxdictParser {
    /// Streams the entries of a single in-memory pack.
    ///
    /// The header is validated eagerly, so a malformed header is reported by
    /// this call rather than by the first `next()` on the returned iterator.
    pub fn parse_bytes(
        bytes: &'static [u8],
    ) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
        DxdictIter::new(Cursor::new(bytes))
            .map(|entries| Box::new(entries) as Box<dyn Iterator<Item = Result<DictEntry>>>)
    }

    /// Streams the entries of several in-memory packs, one pack after another.
    ///
    /// Every header is validated up front, in order; the first invalid header
    /// aborts the call with its error.
    pub fn parse_packs(
        packs: &'static [&'static [u8]],
    ) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
        let iters = packs
            .iter()
            .map(|pack| DxdictIter::new(Cursor::new(*pack)))
            .collect::<Result<VecDeque<_>>>()?;
        Ok(Box::new(MultiDxdictIter { iters }))
    }
}
/// Parser for the dictionary packs compiled into the binary.
///
/// The `path` passed to its methods is not opened as a file; it is matched
/// against the `BUILTIN_*_SOURCE` identifiers by `builtin_packs`.
pub struct BuiltinDxdictParser;

impl DictParser for BuiltinDxdictParser {
    fn name(&self) -> &'static str {
        "DictX Built-in Dictionary Pack"
    }

    fn format_id(&self) -> &'static str {
        "builtin-dxdict"
    }

    /// Reads the header of every pack behind `path` and reports the sum of
    /// their declared entry counts.
    fn validate(&self, path: &Path) -> Result<ValidationReport> {
        let declared_total = builtin_packs(path)?.iter().try_fold(
            0usize,
            |total, pack| -> Result<usize> {
                let (_, header) = read_header(Cursor::new(*pack))?;
                Ok(total + header.entry_count as usize)
            },
        )?;
        Ok(ValidationReport::ok(self.format_id(), Some(declared_total)))
    }

    /// Streams the entries of every pack behind `path`, in pack order.
    fn parse(&self, path: &Path) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
        let packs = builtin_packs(path)?;
        DxdictParser::parse_packs(packs)
    }
}
impl DictParser for DxdictParser {
fn name(&self) -> &'static str {
"DictX Dictionary Pack"
}
fn format_id(&self) -> &'static str {
"dxdict"
}
fn validate(&self, path: &Path) -> Result<ValidationReport> {
let file = File::open(path)?;
let (_, header) = read_header(BufReader::new(file))?;
Ok(ValidationReport::ok(
self.format_id(),
Some(header.entry_count as usize),
))
}
fn parse(&self, path: &Path) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
let file = File::open(path)?;
let iter = DxdictIter::new(BufReader::new(file))?;
Ok(Box::new(iter))
}
}
/// Writes a DXDICT pack to `path` and returns the number of entries written.
///
/// Layout: magic, `u32` version, `u64` entry count, `u32` metadata length,
/// JSON metadata, then one `u32` length + MessagePack payload per entry.
/// All integers are little-endian. The entry count is not known until the
/// iterator is exhausted, so a zero placeholder is written first and patched
/// at the end.
///
/// # Errors
///
/// Returns the first `Err` yielded by `entries`, any I/O or serialization
/// failure, or `DictxError::InvalidData` when the metadata or a single entry
/// is too large for its 32-bit length prefix. On error the file may be left
/// partially written.
pub fn write_dxdict<I>(path: &Path, metadata: &DxdictMetadata, entries: I) -> Result<usize>
where
    I: IntoIterator<Item = Result<DictEntry>>,
{
    // Buffer the many small writes; each entry would otherwise cost two syscalls.
    let mut writer = std::io::BufWriter::new(File::create(path)?);
    writer.write_all(MAGIC)?;
    writer.write_all(&VERSION.to_le_bytes())?;
    // Placeholder entry count, overwritten once all entries are written.
    writer.write_all(&0u64.to_le_bytes())?;

    let metadata_bytes = serde_json::to_vec(metadata)?;
    writer.write_all(&dxdict_len_prefix(metadata_bytes.len(), "元数据")?)?;
    writer.write_all(&metadata_bytes)?;

    let mut count = 0u64;
    for entry in entries {
        let entry = entry?;
        let bytes = rmp_serde::to_vec_named(&entry)
            .map_err(|err| DictxError::InvalidData(format!("DXDICT 序列化失败: {err}")))?;
        writer.write_all(&dxdict_len_prefix(bytes.len(), "词条")?)?;
        writer.write_all(&bytes)?;
        count += 1;
    }

    // `BufWriter::seek` flushes buffered data before repositioning, so the
    // count lands in the header rather than after the last entry.
    writer.seek(SeekFrom::Start(COUNT_OFFSET))?;
    writer.write_all(&count.to_le_bytes())?;
    writer.flush()?;
    Ok(count as usize)
}

/// Encodes `len` as a little-endian `u32` length prefix.
///
/// Fails instead of truncating when `len` exceeds `u32::MAX`; a truncated
/// prefix would make every following record unreadable. `what` names the
/// oversized item in the error message.
fn dxdict_len_prefix(len: usize, what: &str) -> Result<[u8; 4]> {
    if len as u64 > u64::from(u32::MAX) {
        return Err(DictxError::InvalidData(format!(
            "DXDICT {what}过大: {len} 字节"
        )));
    }
    // Lossless: the range was checked above.
    Ok((len as u32).to_le_bytes())
}
fn builtin_packs(path: &Path) -> Result<&'static [&'static [u8]]> {
let source = path.to_string_lossy();
match source.as_ref() {
BUILTIN_NEW_CENTURY_SOURCE => Ok(BUILTIN_NEW_CENTURY_PACKS),
BUILTIN_KD_DATA_SOURCE => Ok(BUILTIN_KD_DATA_PACKS),
other => Err(DictxError::InvalidData(format!("未知内置词库: {other}"))),
}
}
/// The part of a parsed pack header that `read_header` hands back to callers.
#[derive(Debug)]
struct DxdictHeader {
/// Number of entries the header declares (read as a little-endian `u64`).
entry_count: u64,
}
/// Streaming iterator over the entry records that follow a pack header.
struct DxdictIter<R: Read> {
/// Input positioned just past the header, as returned by `read_header`.
reader: R,
/// Entries still to be yielded; initialised from the header's entry count.
remaining: u64,
}
/// Iterator that yields the entries of several in-memory packs back to back.
struct MultiDxdictIter {
/// Per-pack iterators; the front one is drained first and then removed.
iters: VecDeque<DxdictIter<Cursor<&'static [u8]>>>,
}
impl Iterator for MultiDxdictIter {
    type Item = Result<DictEntry>;

    /// Yields the next item of the front pack, dropping packs as they run
    /// dry; returns `None` once no pack is left.
    fn next(&mut self) -> Option<Self::Item> {
        while let Some(current) = self.iters.front_mut() {
            match current.next() {
                Some(item) => return Some(item),
                None => {
                    // Front pack is exhausted: discard it and try the next one.
                    self.iters.pop_front();
                }
            }
        }
        None
    }
}
impl<R: Read> DxdictIter<R> {
    /// Consumes and validates the header of `reader`, then returns an
    /// iterator primed with the entry count that header declares.
    fn new(reader: R) -> Result<Self> {
        read_header(reader).map(|(reader, header)| Self {
            reader,
            remaining: header.entry_count,
        })
    }
}
impl<R: Read> Iterator for DxdictIter<R> {
    type Item = Result<DictEntry>;

    /// Reads and decodes the next entry record.
    ///
    /// Yields `None` once the declared number of entries has been consumed.
    /// A record that is read in full but fails to decode yields `Err` and
    /// iteration may continue with the next record. An I/O failure (including
    /// a truncated record) yields `Err` once and then ends the iteration,
    /// because the stream position is no longer on a record boundary.
    fn next(&mut self) -> Option<Self::Item> {
        if self.remaining == 0 {
            return None;
        }
        self.remaining -= 1;

        let payload = match read_entry_payload(&mut self.reader) {
            Ok(payload) => payload,
            Err(err) => {
                // Stop here: with a corrupt header count, retrying would
                // otherwise repeat the same error up to `remaining` times.
                self.remaining = 0;
                return Some(Err(err.into()));
            }
        };
        Some(
            rmp_serde::from_slice(&payload)
                .map_err(|err| DictxError::InvalidData(format!("DXDICT 反序列化失败: {err}"))),
        )
    }
}

/// Reads one record: a little-endian `u32` length followed by that many bytes.
///
/// The length prefix is untrusted, so the buffer grows as data actually
/// arrives instead of being allocated up front; a bogus 4 GiB prefix on a
/// short file therefore fails with `UnexpectedEof` without a large allocation.
fn read_entry_payload(reader: &mut impl Read) -> std::io::Result<Vec<u8>> {
    // Cap on the capacity reserved before any payload byte has been read.
    const MAX_PREALLOC: u64 = 64 * 1024;

    let mut len_buf = [0u8; 4];
    reader.read_exact(&mut len_buf)?;
    let len = u64::from(u32::from_le_bytes(len_buf));

    let mut payload = Vec::with_capacity(len.min(MAX_PREALLOC) as usize);
    reader.by_ref().take(len).read_to_end(&mut payload)?;
    if payload.len() as u64 != len {
        return Err(std::io::ErrorKind::UnexpectedEof.into());
    }
    Ok(payload)
}
/// Reads and validates a pack header, returning the reader positioned at the
/// first entry record together with the parsed header.
///
/// Header layout (integers little-endian): 8-byte magic, `u32` version,
/// `u64` entry count, `u32` metadata length, JSON metadata.
///
/// # Errors
///
/// `DictxError::InvalidData` for a wrong magic or unsupported version; an I/O
/// error when the input ends early; a JSON error when the metadata does not
/// parse as `DxdictMetadata`.
fn read_header<R: Read>(mut reader: R) -> Result<(R, DxdictHeader)> {
    let mut magic = [0u8; 8];
    reader.read_exact(&mut magic)?;
    if &magic != MAGIC {
        return Err(DictxError::InvalidData("DXDICT magic 不匹配".to_string()));
    }
    let version = read_u32(&mut reader)?;
    if version != VERSION {
        return Err(DictxError::InvalidData(format!(
            "不支持的 DXDICT 版本: {version}"
        )));
    }
    let entry_count = read_u64(&mut reader)?;

    // The metadata length is untrusted: read through `take` so the buffer
    // grows with the data actually present rather than allocating up to
    // 4 GiB up front for a corrupt header.
    let metadata_len = u64::from(read_u32(&mut reader)?);
    let mut metadata = Vec::new();
    reader
        .by_ref()
        .take(metadata_len)
        .read_to_end(&mut metadata)?;
    if metadata.len() as u64 != metadata_len {
        return Err(std::io::Error::from(std::io::ErrorKind::UnexpectedEof).into());
    }

    // Parsed only to validate it; the value itself is not returned.
    let _: DxdictMetadata = serde_json::from_slice(&metadata)?;
    Ok((reader, DxdictHeader { entry_count }))
}
/// Reads exactly four bytes from `reader` and decodes them as a
/// little-endian `u32`. Fails if fewer than four bytes are available.
fn read_u32(reader: &mut impl Read) -> Result<u32> {
    let mut raw = [0u8; 4];
    reader
        .read_exact(&mut raw)
        .map(|()| u32::from_le_bytes(raw))
        .map_err(Into::into)
}
/// Reads exactly eight bytes from `reader` and decodes them as a
/// little-endian `u64`. Fails if fewer than eight bytes are available.
fn read_u64(reader: &mut impl Read) -> Result<u64> {
    let mut raw = [0u8; 8];
    reader
        .read_exact(&mut raw)
        .map(|()| u64::from_le_bytes(raw))
        .map_err(Into::into)
}
#[cfg(test)]
mod tests {
    use super::*;
    use dictx_core::{Definition, DictSource};

    /// Writes a one-entry pack to a temp file, then checks that validation
    /// and parsing both see exactly that entry.
    #[test]
    fn roundtrips_dxdict_pack() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("test.dxdict");

        let metadata = DxdictMetadata {
            name: "test".into(),
            display: Some("Test".into()),
            source_format: Some("unit".into()),
        };
        let mut apple = DictEntry::new(DictSource::Custom { name: "t".into() }, "apple");
        apple
            .definitions
            .push(Definition::new("fruit", "苹果", Some("n".into())));

        let written = write_dxdict(&path, &metadata, vec![Ok(apple)]).unwrap();
        assert_eq!(written, 1);

        let parser = DxdictParser;
        let report = parser.validate(&path).unwrap();
        assert!(report.valid);
        assert_eq!(report.estimated_entries, Some(1));

        let parsed = parser
            .parse(&path)
            .unwrap()
            .collect::<Result<Vec<_>>>()
            .unwrap();
        assert_eq!(parsed[0].word, "apple");
        assert_eq!(parsed[0].definitions[0].zh, "苹果");
    }

    /// The embedded New Century packs validate, declare a plausible entry
    /// count, and yield a non-empty first entry.
    #[test]
    fn validates_builtin_new_century_pack() {
        let source = Path::new(BUILTIN_NEW_CENTURY_SOURCE);
        let parser = BuiltinDxdictParser;

        let report = parser.validate(source).unwrap();
        assert!(report.valid);
        assert!(report.estimated_entries.unwrap_or_default() > 80000);

        let first = parser.parse(source).unwrap().next().unwrap().unwrap();
        assert!(!first.word.is_empty());
    }

    /// The embedded KD packs validate and declare a plausible entry count.
    #[test]
    fn validates_builtin_kd_pack() {
        let source = Path::new(BUILTIN_KD_DATA_SOURCE);
        let report = BuiltinDxdictParser.validate(source).unwrap();
        assert!(report.valid);
        assert!(report.estimated_entries.unwrap_or_default() > 80000);
    }
}