Skip to main content

dictx_parser/
cedict.rs

1use crate::traits::{DictParser, ValidationReport};
2use dictx_core::{clean_text, Definition, DictEntry, DictSource, Result};
3use serde_json::json;
4use std::fs::File;
5use std::io::{BufRead, BufReader, Lines};
6use std::path::Path;
7
8pub struct CedictParser;
9
10impl DictParser for CedictParser {
11    fn name(&self) -> &'static str {
12        "CC-CEDICT"
13    }
14
15    fn format_id(&self) -> &'static str {
16        "cedict"
17    }
18
19    fn validate(&self, path: &Path) -> Result<ValidationReport> {
20        let file = File::open(path)?;
21        let reader = BufReader::new(file);
22        let mut valid_sample = false;
23        let mut count = 0usize;
24
25        for line in reader.lines() {
26            let line = line?;
27            let line = line.trim();
28            if line.is_empty() || line.starts_with('#') {
29                continue;
30            }
31            count += 1;
32            if parse_line(line).is_some() {
33                valid_sample = true;
34            }
35            if count > 100 && valid_sample {
36                break;
37            }
38        }
39
40        if valid_sample {
41            Ok(ValidationReport::ok(self.format_id(), Some(count)))
42        } else {
43            Ok(ValidationReport::invalid(
44                self.format_id(),
45                "未识别到 CC-CEDICT 格式条目",
46            ))
47        }
48    }
49
50    fn parse(&self, path: &Path) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
51        let file = File::open(path)?;
52        Ok(Box::new(CedictIter {
53            lines: BufReader::new(file).lines(),
54        }))
55    }
56}
57
58struct CedictIter {
59    lines: Lines<BufReader<File>>,
60}
61
62impl Iterator for CedictIter {
63    type Item = Result<DictEntry>;
64
65    fn next(&mut self) -> Option<Self::Item> {
66        for line in self.lines.by_ref() {
67            match line {
68                Ok(line) => {
69                    let line = line.trim();
70                    if line.is_empty() || line.starts_with('#') {
71                        continue;
72                    }
73                    if let Some(entry) = parse_line(line) {
74                        return Some(Ok(entry));
75                    }
76                }
77                Err(err) => return Some(Err(err.into())),
78            }
79        }
80        None
81    }
82}
83
84fn parse_line(line: &str) -> Option<DictEntry> {
85    let first_space = line.find(' ')?;
86    let traditional = &line[..first_space];
87    let rest = line[first_space + 1..].trim_start();
88    let second_space = rest.find(' ')?;
89    let simplified = &rest[..second_space];
90    let rest = rest[second_space + 1..].trim_start();
91    let pinyin_start = rest.find('[')?;
92    let pinyin_end = rest.find(']')?;
93    if pinyin_end <= pinyin_start {
94        return None;
95    }
96    let pinyin = &rest[pinyin_start + 1..pinyin_end];
97    let definitions = rest[pinyin_end + 1..].trim();
98    let definitions = definitions.strip_prefix('/')?.strip_suffix('/')?;
99
100    let mut entry = DictEntry::new(
101        DictSource::Custom {
102            name: "cc-cedict".to_string(),
103        },
104        clean_text(simplified),
105    );
106    entry.phonetic_uk = Some(clean_text(pinyin));
107    entry.definitions = definitions
108        .split('/')
109        .map(clean_text)
110        .filter(|definition| !definition.is_empty())
111        .map(|definition| Definition::new(definition, simplified, None))
112        .collect();
113    entry.tags = vec!["cc-cedict".to_string()];
114    entry.extra = json!({
115        "traditional": traditional,
116        "pinyin": pinyin,
117        "license": "CC BY-SA",
118    });
119
120    if entry.definitions.is_empty() {
121        None
122    } else {
123        Some(entry)
124    }
125}
126
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    /// A well-formed line yields the simplified headword, the pinyin and the
    /// first gloss.
    #[test]
    fn parses_cedict_line() {
        let sample = "老師 老师 [lao3 shi1] /teacher/CL:個|个[ge4]/";
        let parsed = parse_line(sample).unwrap();

        assert_eq!(parsed.word, "老师");
        assert_eq!(parsed.phonetic(), Some("lao3 shi1"));
        assert_eq!(parsed.definitions[0].en, "teacher");
    }

    /// Parsing a file skips the comment line and returns the single entry.
    #[test]
    fn parses_cedict_file() {
        let mut dict_file = tempfile::NamedTempFile::new().unwrap();
        writeln!(dict_file, "# comment").unwrap();
        writeln!(dict_file, "老師 老师 [lao3 shi1] /teacher/").unwrap();

        let parsed: Result<Vec<DictEntry>> = CedictParser
            .parse(dict_file.path())
            .unwrap()
            .collect();
        let parsed = parsed.unwrap();

        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].word, "老师");
    }
}
155}