// dictx-parser 0.1.1
//
// Dictionary source parsers for DictX.
// Documentation
use crate::traits::{DictParser, ValidationReport};
use dictx_core::{clean_text, Definition, DictEntry, DictSource, Result};
use serde_json::json;
use std::fs::File;
use std::io::{BufRead, BufReader, Lines};
use std::path::Path;

/// Parser for dictionary files in the CC-CEDICT text format.
///
/// This is a stateless unit struct; all behavior is provided through its
/// `DictParser` implementation.
pub struct CedictParser;

/// `DictParser` implementation for the CC-CEDICT text format.
impl DictParser for CedictParser {
    /// Human-readable name of the supported format.
    fn name(&self) -> &'static str {
        "CC-CEDICT"
    }

    /// Short identifier of the supported format.
    fn format_id(&self) -> &'static str {
        "cedict"
    }

    /// Samples the file at `path` to decide whether it looks like CC-CEDICT.
    ///
    /// Blank lines and lines starting with `#` are ignored. Every other line
    /// is counted and run through `parse_line`. Reading stops as soon as more
    /// than 100 lines have been counted and at least one of them parsed;
    /// otherwise the file is read to the end.
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be opened or a line cannot be read.
    fn validate(&self, path: &Path) -> Result<ValidationReport> {
        let mut sampled = 0usize;
        let mut recognised = false;

        for raw in BufReader::new(File::open(path)?).lines() {
            let raw = raw?;
            let candidate = raw.trim();
            let is_payload = !(candidate.is_empty() || candidate.starts_with('#'));
            if is_payload {
                sampled += 1;
                // `|=` does not short-circuit, so every counted line is still parsed.
                recognised |= parse_line(candidate).is_some();
                if recognised && sampled > 100 {
                    break;
                }
            }
        }

        let report = if recognised {
            ValidationReport::ok(self.format_id(), Some(sampled))
        } else {
            ValidationReport::invalid(self.format_id(), "未识别到 CC-CEDICT 格式条目")
        };
        Ok(report)
    }

    /// Opens the file at `path` and returns a lazy iterator over its entries.
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be opened. Read errors that occur
    /// later are reported through the iterator's items.
    fn parse(&self, path: &Path) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
        let lines = BufReader::new(File::open(path)?).lines();
        Ok(Box::new(CedictIter { lines }))
    }
}

/// Iterator that reads a dictionary file line by line and yields its entries.
struct CedictIter {
    /// Buffered line reader over the opened source file.
    lines: Lines<BufReader<File>>,
}

impl Iterator for CedictIter {
    type Item = Result<DictEntry>;

    /// Advances to the next line that `parse_line` accepts and yields its entry.
    ///
    /// Blank lines, lines starting with `#`, and lines that fail to parse are
    /// skipped. A read error is yielded as an `Err` item. `None` is returned
    /// once the underlying line reader is exhausted.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            // `?` ends iteration when the reader has no more lines.
            let raw = match self.lines.next()? {
                Ok(text) => text,
                Err(io_err) => return Some(Err(io_err.into())),
            };

            let candidate = raw.trim();
            if candidate.is_empty() || candidate.starts_with('#') {
                continue;
            }

            if let Some(entry) = parse_line(candidate) {
                return Some(Ok(entry));
            }
        }
    }
}

/// Parses one line of the form `traditional simplified [pinyin] /gloss/gloss/`.
///
/// The simplified form becomes the entry's word, the bracketed reading is
/// stored as `phonetic_uk`, and each non-empty gloss becomes one definition.
/// The traditional form, the raw reading and a license note are recorded in
/// `extra`.
///
/// Returns `None` when the line has fewer than two space-separated leading
/// tokens, when no `[` followed by a later `]` is found, when the remainder is
/// not wrapped in `/…/`, or when no gloss is left non-empty after `clean_text`.
fn parse_line(line: &str) -> Option<DictEntry> {
    // The first two space-separated tokens are the two headword forms.
    let (traditional, tail) = line.split_once(' ')?;
    let (simplified, tail) = tail.trim_start().split_once(' ')?;
    let tail = tail.trim_start();

    // The reading sits between the first `[` and the first `]`, which must come later.
    let open = tail.find('[')?;
    let close = tail.find(']').filter(|&close| close > open)?;
    let pinyin = &tail[open + 1..close];

    // Everything after the reading must be wrapped in a leading and a trailing slash.
    let glosses = tail[close + 1..]
        .trim()
        .strip_prefix('/')?
        .strip_suffix('/')?;

    let source = DictSource::Custom {
        name: String::from("cc-cedict"),
    };
    let mut entry = DictEntry::new(source, clean_text(simplified));
    entry.phonetic_uk = Some(clean_text(pinyin));
    entry.definitions = glosses
        .split('/')
        .filter_map(|gloss| {
            let gloss = clean_text(gloss);
            if gloss.is_empty() {
                None
            } else {
                Some(Definition::new(gloss, simplified, None))
            }
        })
        .collect();
    entry.tags = vec![String::from("cc-cedict")];
    entry.extra = json!({
        "traditional": traditional,
        "pinyin": pinyin,
        "license": "CC BY-SA",
    });

    // A line whose glosses are all empty carries no usable entry.
    if entry.definitions.is_empty() {
        return None;
    }
    Some(entry)
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    /// A single well-formed line yields the simplified headword, the reading
    /// and the first gloss.
    #[test]
    fn parses_cedict_line() {
        let sample = "老師 老师 [lao3 shi1] /teacher/CL:個|个[ge4]/";
        let parsed = parse_line(sample).unwrap();

        assert_eq!(parsed.word, "老师");
        assert_eq!(parsed.phonetic(), Some("lao3 shi1"));
        assert_eq!(parsed.definitions[0].en, "teacher");
    }

    /// Parsing a file skips the comment line and yields the one real entry.
    #[test]
    fn parses_cedict_file() {
        let mut fixture = tempfile::NamedTempFile::new().unwrap();
        for row in ["# comment", "老師 老师 [lao3 shi1] /teacher/"].iter() {
            writeln!(fixture, "{}", row).unwrap();
        }

        let parsed = CedictParser
            .parse(fixture.path())
            .unwrap()
            .collect::<Result<Vec<_>>>()
            .unwrap();

        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].word, "老师");
    }
}