1use crate::traits::{DictParser, ValidationReport};
2use dictx_core::{clean_text, Definition, DictEntry, DictSource, Result};
3use serde_json::json;
4use std::fs::File;
5use std::io::{BufRead, BufReader, Lines};
6use std::path::Path;
7
/// Parser for dictionary files in the CC-CEDICT text format.
///
/// The type carries no state; all behaviour lives in its `DictParser`
/// implementation. The common traits are derived so the parser can be
/// logged, copied freely and created through `Default`.
#[derive(Debug, Clone, Copy, Default)]
pub struct CedictParser;
9
10impl DictParser for CedictParser {
11 fn name(&self) -> &'static str {
12 "CC-CEDICT"
13 }
14
15 fn format_id(&self) -> &'static str {
16 "cedict"
17 }
18
19 fn validate(&self, path: &Path) -> Result<ValidationReport> {
20 let file = File::open(path)?;
21 let reader = BufReader::new(file);
22 let mut valid_sample = false;
23 let mut count = 0usize;
24
25 for line in reader.lines() {
26 let line = line?;
27 let line = line.trim();
28 if line.is_empty() || line.starts_with('#') {
29 continue;
30 }
31 count += 1;
32 if parse_line(line).is_some() {
33 valid_sample = true;
34 }
35 if count > 100 && valid_sample {
36 break;
37 }
38 }
39
40 if valid_sample {
41 Ok(ValidationReport::ok(self.format_id(), Some(count)))
42 } else {
43 Ok(ValidationReport::invalid(
44 self.format_id(),
45 "未识别到 CC-CEDICT 格式条目",
46 ))
47 }
48 }
49
50 fn parse(&self, path: &Path) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
51 let file = File::open(path)?;
52 Ok(Box::new(CedictIter {
53 lines: BufReader::new(file).lines(),
54 }))
55 }
56}
57
/// Streaming iterator over the entries of a CC-CEDICT file.
///
/// Wraps the buffered line reader of the open file; lines are turned into
/// entries one at a time as the iterator is advanced.
struct CedictIter {
    /// Remaining lines of the underlying file.
    lines: Lines<BufReader<File>>,
}
61
62impl Iterator for CedictIter {
63 type Item = Result<DictEntry>;
64
65 fn next(&mut self) -> Option<Self::Item> {
66 for line in self.lines.by_ref() {
67 match line {
68 Ok(line) => {
69 let line = line.trim();
70 if line.is_empty() || line.starts_with('#') {
71 continue;
72 }
73 if let Some(entry) = parse_line(line) {
74 return Some(Ok(entry));
75 }
76 }
77 Err(err) => return Some(Err(err.into())),
78 }
79 }
80 None
81 }
82}
83
84fn parse_line(line: &str) -> Option<DictEntry> {
85 let first_space = line.find(' ')?;
86 let traditional = &line[..first_space];
87 let rest = line[first_space + 1..].trim_start();
88 let second_space = rest.find(' ')?;
89 let simplified = &rest[..second_space];
90 let rest = rest[second_space + 1..].trim_start();
91 let pinyin_start = rest.find('[')?;
92 let pinyin_end = rest.find(']')?;
93 if pinyin_end <= pinyin_start {
94 return None;
95 }
96 let pinyin = &rest[pinyin_start + 1..pinyin_end];
97 let definitions = rest[pinyin_end + 1..].trim();
98 let definitions = definitions.strip_prefix('/')?.strip_suffix('/')?;
99
100 let mut entry = DictEntry::new(
101 DictSource::Custom {
102 name: "cc-cedict".to_string(),
103 },
104 clean_text(simplified),
105 );
106 entry.phonetic_uk = Some(clean_text(pinyin));
107 entry.definitions = definitions
108 .split('/')
109 .map(clean_text)
110 .filter(|definition| !definition.is_empty())
111 .map(|definition| Definition::new(definition, simplified, None))
112 .collect();
113 entry.tags = vec!["cc-cedict".to_string()];
114 entry.extra = json!({
115 "traditional": traditional,
116 "pinyin": pinyin,
117 "license": "CC BY-SA",
118 });
119
120 if entry.definitions.is_empty() {
121 None
122 } else {
123 Some(entry)
124 }
125}
126
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    /// A well-formed line yields the simplified headword, the pinyin and the
    /// first definition.
    #[test]
    fn parses_cedict_line() {
        let parsed = parse_line("老師 老师 [lao3 shi1] /teacher/CL:個|个[ge4]/").unwrap();

        assert_eq!(parsed.word, "老师");
        assert_eq!(parsed.phonetic(), Some("lao3 shi1"));
        assert_eq!(parsed.definitions[0].en, "teacher");
    }

    /// Parsing a file skips the comment line and returns the single entry.
    #[test]
    fn parses_cedict_file() {
        let mut fixture = tempfile::NamedTempFile::new().unwrap();
        for row in &["# comment", "老師 老师 [lao3 shi1] /teacher/"] {
            writeln!(fixture, "{}", row).unwrap();
        }

        let parsed: Result<Vec<_>> = CedictParser.parse(fixture.path()).unwrap().collect();
        let parsed = parsed.unwrap();

        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].word, "老师");
    }
}