dictx-parser 0.1.1

Dictionary source parsers for DictX.
Documentation
use crate::traits::{DictParser, ValidationReport};
use dictx_core::{DictEntry, DictxError, Result};
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
use std::fs::File;
use std::io::{BufReader, Cursor, Read, Seek, SeekFrom, Write};
use std::path::Path;

/// Magic bytes written at the very start of every DXDICT file and checked by `read_header`.
const MAGIC: &[u8; 8] = b"DXDICT01";
/// The only container version this module writes; `read_header` rejects any other value.
const VERSION: u32 = 1;
/// Byte offset of the `u64` entry count: it follows the 8-byte magic and the 4-byte version.
/// `write_dxdict` seeks back here to patch in the final count.
const COUNT_OFFSET: u64 = 12;

/// Pseudo-path that `builtin_packs` maps to the embedded New Century packs.
pub const BUILTIN_NEW_CENTURY_SOURCE: &str = "builtin:new-century-han-eng";
/// Pseudo-path that `builtin_packs` maps to the embedded KD data packs.
pub const BUILTIN_KD_DATA_SOURCE: &str = "builtin:kd-data";

/// Embedded DXDICT byte slices for the New Century source, listed in the order they are read.
const BUILTIN_NEW_CENTURY_PACKS: &[&[u8]] = &[
    dictx_data_new_century_1::BYTES,
    dictx_data_new_century_2::BYTES,
];
/// Embedded DXDICT byte slices for the KD data source, listed in the order they are read.
const BUILTIN_KD_DATA_PACKS: &[&[u8]] = &[
    dictx_data_kd_1::BYTES,
    dictx_data_kd_2::BYTES,
    dictx_data_kd_3::BYTES,
    dictx_data_kd_4::BYTES,
];

/// Metadata block stored as JSON in the DXDICT header, right after the entry count.
///
/// `write_dxdict` serializes it with `serde_json`; `read_header` deserializes it only to
/// check that it is well-formed and then discards the value.
#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq, Eq)]
pub struct DxdictMetadata {
    /// Name of the pack.
    pub name: String,
    /// Optional display string (presumably a human-readable label — confirm with callers).
    pub display: Option<String>,
    /// Optional identifier of the format the pack was produced from
    /// (assumed from the field name — confirm with callers).
    pub source_format: Option<String>,
}

/// Parser for DXDICT packs, usable on files (via [`DictParser`]) or on in-memory bytes.
pub struct DxdictParser;

impl DxdictParser {
    /// Streams the entries of a single in-memory DXDICT pack.
    ///
    /// # Errors
    ///
    /// Fails immediately if the header of `bytes` cannot be read or is rejected; errors
    /// on individual entries are surfaced later through the returned iterator.
    pub fn parse_bytes(
        bytes: &'static [u8],
    ) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
        let entries = DxdictIter::new(Cursor::new(bytes))?;
        Ok(Box::new(entries))
    }

    /// Streams the entries of several in-memory DXDICT packs, one pack after another.
    ///
    /// # Errors
    ///
    /// Every pack header is read up front, in order; the first one that fails aborts
    /// the call before any entry is yielded.
    pub fn parse_packs(
        packs: &'static [&'static [u8]],
    ) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
        // Collecting into `Result<VecDeque<_>>` stops at the first failing header.
        let iters = packs
            .iter()
            .map(|pack| DxdictIter::new(Cursor::new(*pack)))
            .collect::<Result<VecDeque<_>>>()?;
        Ok(Box::new(MultiDxdictIter { iters }))
    }
}

/// Parser for the dictionary packs compiled into the binary.
///
/// The `path` handed to its [`DictParser`] methods is not a filesystem path but one of
/// the `builtin:` source identifiers resolved by `builtin_packs`.
pub struct BuiltinDxdictParser;

impl DictParser for BuiltinDxdictParser {
    fn name(&self) -> &'static str {
        "DictX Built-in Dictionary Pack"
    }

    fn format_id(&self) -> &'static str {
        "builtin-dxdict"
    }

    /// Reads the header of every pack behind `path` and reports the summed entry count.
    fn validate(&self, path: &Path) -> Result<ValidationReport> {
        let total = builtin_packs(path)?
            .iter()
            .try_fold(0usize, |running, pack| {
                read_header(Cursor::new(*pack))
                    .map(|(_, header)| running + header.entry_count as usize)
            })?;
        Ok(ValidationReport::ok(self.format_id(), Some(total)))
    }

    /// Streams the entries of all packs behind `path`, in pack order.
    fn parse(&self, path: &Path) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
        let packs = builtin_packs(path)?;
        DxdictParser::parse_packs(packs)
    }
}

impl DictParser for DxdictParser {
    fn name(&self) -> &'static str {
        "DictX Dictionary Pack"
    }

    fn format_id(&self) -> &'static str {
        "dxdict"
    }

    /// Opens the file at `path`, checks its header and reports the entry count it declares.
    fn validate(&self, path: &Path) -> Result<ValidationReport> {
        let reader = BufReader::new(File::open(path)?);
        let (_, header) = read_header(reader)?;
        let declared = header.entry_count as usize;
        Ok(ValidationReport::ok(self.format_id(), Some(declared)))
    }

    /// Opens the file at `path` and streams its entries through a buffered reader.
    fn parse(&self, path: &Path) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
        let reader = BufReader::new(File::open(path)?);
        let entries = DxdictIter::new(reader)?;
        Ok(Box::new(entries))
    }
}

/// Writes `entries` to `path` as a DXDICT pack and returns how many entries were written.
///
/// File layout, all integers little-endian:
///
/// 1. the 8-byte `MAGIC`,
/// 2. the `u32` `VERSION`,
/// 3. the `u64` entry count (written as `0` first, patched once the final count is known),
/// 4. a `u32` length followed by `metadata` encoded as JSON,
/// 5. one frame per entry: a `u32` length followed by the entry encoded as MessagePack
///    with named fields.
///
/// # Errors
///
/// Returns an error if the file cannot be created or written, if `metadata` or an entry
/// cannot be serialized, if a serialized payload does not fit in the `u32` length
/// prefix, or as soon as `entries` yields an `Err`. On failure the partially written
/// file is left on disk.
pub fn write_dxdict<I>(path: &Path, metadata: &DxdictMetadata, entries: I) -> Result<usize>
where
    I: IntoIterator<Item = Result<DictEntry>>,
{
    // Buffer the many small length/payload writes instead of issuing a syscall for each.
    let mut out = std::io::BufWriter::new(File::create(path)?);
    out.write_all(MAGIC)?;
    out.write_all(&VERSION.to_le_bytes())?;
    // Placeholder for the entry count; the real value is patched in below.
    out.write_all(&0u64.to_le_bytes())?;

    let metadata_bytes = serde_json::to_vec(metadata)?;
    out.write_all(&dxdict_len_prefix(metadata_bytes.len(), "metadata")?)?;
    out.write_all(&metadata_bytes)?;

    let mut count = 0usize;
    for entry in entries {
        let entry = entry?;
        let bytes = rmp_serde::to_vec_named(&entry)
            .map_err(|err| DictxError::InvalidData(format!("DXDICT 序列化失败: {err}")))?;
        out.write_all(&dxdict_len_prefix(bytes.len(), "entry")?)?;
        out.write_all(&bytes)?;
        count += 1;
    }

    // `BufWriter::seek` flushes the buffered data before repositioning the file.
    out.seek(SeekFrom::Start(COUNT_OFFSET))?;
    // Widening `usize` to `u64` is lossless on every supported target.
    out.write_all(&(count as u64).to_le_bytes())?;
    out.flush()?;
    Ok(count)
}

/// Encodes `len` as the little-endian `u32` prefix used by DXDICT frames.
///
/// Fails instead of silently truncating when `len` does not fit in a `u32`, because a
/// truncated prefix would make the resulting file unreadable. `what` names the payload
/// in the error message.
fn dxdict_len_prefix(len: usize, what: &str) -> Result<[u8; 4]> {
    <u32 as std::convert::TryFrom<usize>>::try_from(len)
        .map(u32::to_le_bytes)
        .map_err(|_| {
            DictxError::InvalidData(format!("DXDICT {what} 过大，无法写入: {len} 字节"))
        })
}

fn builtin_packs(path: &Path) -> Result<&'static [&'static [u8]]> {
    let source = path.to_string_lossy();
    match source.as_ref() {
        BUILTIN_NEW_CENTURY_SOURCE => Ok(BUILTIN_NEW_CENTURY_PACKS),
        BUILTIN_KD_DATA_SOURCE => Ok(BUILTIN_KD_DATA_PACKS),
        other => Err(DictxError::InvalidData(format!("未知内置词库: {other}"))),
    }
}

/// The part of a DXDICT header that callers use after `read_header` has checked the file.
#[derive(Debug)]
struct DxdictHeader {
    /// Number of entry frames the file declares in its header.
    entry_count: u64,
}

/// Streaming iterator over the entry frames of one DXDICT source.
struct DxdictIter<R: Read> {
    /// Source positioned just past the header, i.e. at the next entry frame.
    reader: R,
    /// Entries still to be yielded; starts at the count declared in the header.
    remaining: u64,
}

/// Chains several in-memory pack iterators into one continuous stream of entries.
struct MultiDxdictIter {
    /// Packs not yet exhausted; the front one is the pack currently being read.
    iters: VecDeque<DxdictIter<Cursor<&'static [u8]>>>,
}

impl Iterator for MultiDxdictIter {
    type Item = Result<DictEntry>;

    /// Yields the next item of the front pack, dropping packs as they run dry.
    fn next(&mut self) -> Option<Self::Item> {
        while let Some(current) = self.iters.front_mut() {
            match current.next() {
                Some(item) => return Some(item),
                // The front pack is exhausted: discard it and try the next one.
                None => {
                    self.iters.pop_front();
                }
            }
        }
        None
    }
}

impl<R: Read> DxdictIter<R> {
    /// Consumes the header from `reader` and prepares to stream the declared entries.
    ///
    /// # Errors
    ///
    /// Propagates any error reported by `read_header`.
    fn new(reader: R) -> Result<Self> {
        read_header(reader).map(|(reader, header)| Self {
            reader,
            remaining: header.entry_count,
        })
    }
}

impl<R: Read> Iterator for DxdictIter<R> {
    type Item = Result<DictEntry>;

    /// Reads and decodes the next entry frame: a little-endian `u32` length followed by
    /// that many bytes of MessagePack.
    ///
    /// Returns `None` once the number of entries declared in the header has been
    /// yielded. An I/O failure, including a truncated frame, is reported once and then
    /// ends the iteration, because the stream position is no longer at a frame
    /// boundary. A frame that was read completely but fails to decode is reported and
    /// iteration continues with the next frame.
    fn next(&mut self) -> Option<Self::Item> {
        // Upper bound on the buffer reserved before any payload byte has arrived.
        const MAX_PREALLOC: usize = 64 * 1024;

        if self.remaining == 0 {
            return None;
        }
        self.remaining -= 1;

        let mut len_buf = [0u8; 4];
        if let Err(err) = self.reader.read_exact(&mut len_buf) {
            self.remaining = 0;
            return Some(Err(err.into()));
        }
        let len = u32::from_le_bytes(len_buf) as usize;

        // The length prefix is untrusted: a corrupt file could claim up to 4 GiB.
        // Reserve a bounded amount and let the buffer grow only as data actually arrives.
        let mut bytes = Vec::with_capacity(len.min(MAX_PREALLOC));
        let outcome = self
            .reader
            .by_ref()
            .take(len as u64)
            .read_to_end(&mut bytes);
        match outcome {
            Ok(read) if read == len => {}
            Ok(_) => {
                // The source ended before the frame was complete.
                self.remaining = 0;
                let eof = std::io::Error::from(std::io::ErrorKind::UnexpectedEof);
                return Some(Err(eof.into()));
            }
            Err(err) => {
                self.remaining = 0;
                return Some(Err(err.into()));
            }
        }

        Some(
            rmp_serde::from_slice(&bytes)
                .map_err(|err| DictxError::InvalidData(format!("DXDICT 反序列化失败: {err}"))),
        )
    }
}

/// Reads and checks a DXDICT header, returning the reader positioned at the first entry
/// frame together with the parsed header.
///
/// The header consists of the 8-byte `MAGIC`, a little-endian `u32` version, a
/// little-endian `u64` entry count, and a `u32`-length-prefixed JSON metadata blob. The
/// metadata is deserialized only to check that it is well-formed, then discarded.
///
/// # Errors
///
/// Returns `DictxError::InvalidData` for a wrong magic or an unsupported version, and
/// propagates I/O errors (including a truncated header) and JSON errors.
fn read_header<R: Read>(mut reader: R) -> Result<(R, DxdictHeader)> {
    // Upper bound on the buffer reserved before any metadata byte has arrived.
    const MAX_PREALLOC: usize = 64 * 1024;

    let mut magic = [0u8; 8];
    reader.read_exact(&mut magic)?;
    if &magic != MAGIC {
        return Err(DictxError::InvalidData("DXDICT magic 不匹配".to_string()));
    }

    let version = read_u32(&mut reader)?;
    if version != VERSION {
        return Err(DictxError::InvalidData(format!(
            "不支持的 DXDICT 版本: {version}"
        )));
    }

    let entry_count = read_u64(&mut reader)?;
    let metadata_len = read_u32(&mut reader)? as usize;

    // The declared length is untrusted: reserve a bounded amount instead of allocating
    // up to 4 GiB up front, and let the buffer grow only as data actually arrives.
    let mut metadata = Vec::with_capacity(metadata_len.min(MAX_PREALLOC));
    let read = reader
        .by_ref()
        .take(metadata_len as u64)
        .read_to_end(&mut metadata)?;
    if read != metadata_len {
        // The source ended before the declared metadata length was reached.
        return Err(std::io::Error::from(std::io::ErrorKind::UnexpectedEof).into());
    }
    let _: DxdictMetadata = serde_json::from_slice(&metadata)?;

    Ok((reader, DxdictHeader { entry_count }))
}

/// Reads exactly four bytes from `reader` and decodes them as a little-endian `u32`.
///
/// # Errors
///
/// Propagates the I/O error if four bytes cannot be read.
fn read_u32(reader: &mut impl Read) -> Result<u32> {
    let mut raw = [0u8; std::mem::size_of::<u32>()];
    reader.read_exact(&mut raw)?;
    let value = u32::from_le_bytes(raw);
    Ok(value)
}

/// Reads exactly eight bytes from `reader` and decodes them as a little-endian `u64`.
///
/// # Errors
///
/// Propagates the I/O error if eight bytes cannot be read.
fn read_u64(reader: &mut impl Read) -> Result<u64> {
    let mut raw = [0u8; std::mem::size_of::<u64>()];
    reader.read_exact(&mut raw)?;
    let value = u64::from_le_bytes(raw);
    Ok(value)
}

#[cfg(test)]
mod tests {
    use super::*;
    use dictx_core::{Definition, DictSource};

    /// Writes a one-entry pack to a temporary file, then checks that validation reports
    /// one entry and that parsing returns the same word and definition.
    #[test]
    fn roundtrips_dxdict_pack() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("test.dxdict");
        let mut entry = DictEntry::new(DictSource::Custom { name: "t".into() }, "apple");
        entry
            .definitions
            .push(Definition::new("fruit", "苹果", Some("n".into())));

        let count = write_dxdict(
            &path,
            &DxdictMetadata {
                name: "test".into(),
                display: Some("Test".into()),
                source_format: Some("unit".into()),
            },
            vec![Ok(entry)],
        )
        .unwrap();
        assert_eq!(count, 1);

        // Validation only reads the header, so the count comes from the patched field.
        let parser = DxdictParser;
        let report = parser.validate(&path).unwrap();
        assert!(report.valid);
        assert_eq!(report.estimated_entries, Some(1));

        let entries = parser
            .parse(&path)
            .unwrap()
            .collect::<Result<Vec<_>>>()
            .unwrap();
        assert_eq!(entries[0].word, "apple");
        assert_eq!(entries[0].definitions[0].zh, "苹果");
    }

    /// Checks that the embedded New Century packs validate, declare more than 80000
    /// entries in total, and yield a first entry with a non-empty word.
    #[test]
    fn validates_builtin_new_century_pack() {
        let parser = BuiltinDxdictParser;
        let path = Path::new(BUILTIN_NEW_CENTURY_SOURCE);

        let report = parser.validate(path).unwrap();
        assert!(report.valid);
        assert!(report.estimated_entries.unwrap_or_default() > 80000);

        let mut entries = parser.parse(path).unwrap();
        let first = entries.next().unwrap().unwrap();
        assert!(!first.word.is_empty());
    }

    /// Checks that the embedded KD packs validate and declare more than 80000 entries
    /// in total.
    #[test]
    fn validates_builtin_kd_pack() {
        let parser = BuiltinDxdictParser;
        let path = Path::new(BUILTIN_KD_DATA_SOURCE);

        let report = parser.validate(path).unwrap();
        assert!(report.valid);
        assert!(report.estimated_entries.unwrap_or_default() > 80000);
    }
}