1use crate::traits::{DictParser, ValidationReport};
2use dictx_core::{
3 clean_pos, clean_text, normalize_tag, Definition, DictEntry, DictSource, Exchange, Result,
4};
5use serde::Deserialize;
6use serde_json::json;
7use std::collections::BTreeSet;
8use std::fs::File;
9use std::io::{BufRead, BufReader};
10use std::path::Path;
11
/// Parser for dictionary data in the ECDICT CSV format.
///
/// This is a stateless unit struct; all of its behavior comes from its
/// `DictParser` implementation.
pub struct EcdictParser;
13
14impl DictParser for EcdictParser {
15 fn name(&self) -> &'static str {
16 "ECDICT CSV"
17 }
18
19 fn format_id(&self) -> &'static str {
20 "ecdict"
21 }
22
23 fn validate(&self, path: &Path) -> Result<ValidationReport> {
24 let file = File::open(path)?;
25 let mut reader = csv::ReaderBuilder::new()
26 .has_headers(true)
27 .flexible(true)
28 .from_reader(file);
29 let headers = reader.headers()?.clone();
30 let required = ["word", "translation"];
31 let missing: Vec<_> = required
32 .iter()
33 .filter(|name| !headers.iter().any(|h| h == **name))
34 .copied()
35 .collect();
36
37 if !missing.is_empty() {
38 return Ok(ValidationReport::invalid(
39 self.format_id(),
40 format!("缺少必要列: {}", missing.join(", ")),
41 ));
42 }
43
44 Ok(ValidationReport::ok(
45 self.format_id(),
46 count_lines(path).ok(),
47 ))
48 }
49
50 fn parse(&self, path: &Path) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
51 let file = File::open(path)?;
52 let reader = csv::ReaderBuilder::new()
53 .has_headers(true)
54 .flexible(true)
55 .trim(csv::Trim::All)
56 .from_reader(file);
57
58 let iter = reader
59 .into_deserialize::<EcdictRawRow>()
60 .filter_map(|row| match row {
61 Ok(row) if row.word.trim().is_empty() => None,
62 Ok(row) => Some(row.into_entry()),
63 Err(err) => Some(Err(err.into())),
64 });
65
66 Ok(Box::new(iter))
67 }
68}
69
/// One raw CSV row as deserialized by `EcdictParser::parse`.
///
/// Each field holds the text of the column with the same name. Every column
/// except `word` is marked `#[serde(default)]`, so a column that is absent
/// from the file deserializes to `None`. `into_entry` interprets the raw text.
#[derive(Debug, Deserialize)]
struct EcdictRawRow {
    // Headword; `parse` skips rows where this is empty after trimming.
    word: String,
    // Phonetic transcription; stored in the entry's `phonetic_uk`.
    #[serde(default)]
    phonetic: Option<String>,
    // Definition text passed to `pair_definitions` as its `en` argument.
    #[serde(default)]
    definition: Option<String>,
    // Translation text passed to `pair_definitions` as its `zh` argument.
    #[serde(default)]
    translation: Option<String>,
    // Part-of-speech list; split on `/`, `,`, `;` and spaces by `split_pos`.
    #[serde(default)]
    pos: Option<String>,
    // Collins star rating; parsed as `u8` and capped at 5, 0 when unparseable.
    #[serde(default)]
    collins: Option<String>,
    // Oxford flag; only "1", "true" and "yes" are treated as true.
    #[serde(default)]
    oxford: Option<String>,
    // Whitespace-separated tags; each is passed through `normalize_tag`.
    #[serde(default)]
    tag: Option<String>,
    // Numeric value stored in `freq_bnc`; empty or "0" becomes `None`.
    #[serde(default)]
    bnc: Option<String>,
    // Numeric value stored in `freq_coca`; empty or "0" becomes `None`.
    #[serde(default)]
    frq: Option<String>,
    // `/`-separated `kind:word` pairs; parsed by `parse_exchanges`.
    #[serde(default)]
    exchange: Option<String>,
    // Stored under `extra["detail"]`, as parsed JSON when it is valid JSON and
    // as a plain string otherwise.
    #[serde(default)]
    detail: Option<String>,
    // Stored as a string under `extra["audio"]`.
    #[serde(default)]
    audio: Option<String>,
}
98
99impl EcdictRawRow {
100 fn into_entry(self) -> Result<DictEntry> {
101 let mut entry = DictEntry::new(DictSource::Ecdict, clean_text(&self.word));
102 entry.phonetic_uk = clean_optional(self.phonetic);
103 entry.collins_star = self
104 .collins
105 .as_deref()
106 .and_then(|value| value.trim().parse::<u8>().ok())
107 .unwrap_or(0)
108 .min(5);
109 entry.oxford_3000 = self
110 .oxford
111 .as_deref()
112 .map(|value| matches!(value.trim(), "1" | "true" | "yes"))
113 .unwrap_or(false);
114 entry.freq_bnc = parse_u32(self.bnc.as_deref());
115 entry.freq_coca = parse_u32(self.frq.as_deref());
116 entry.pos = split_pos(self.pos.as_deref());
117 entry.tags = self
118 .tag
119 .as_deref()
120 .unwrap_or_default()
121 .split_whitespace()
122 .map(normalize_tag)
123 .filter(|tag| !tag.is_empty())
124 .collect();
125 entry.exchanges = parse_exchanges(self.exchange.as_deref());
126 entry.definitions = pair_definitions(
127 self.definition.as_deref().unwrap_or_default(),
128 self.translation.as_deref().unwrap_or_default(),
129 entry.pos.first().cloned(),
130 );
131
132 let mut extra = serde_json::Map::new();
133 if let Some(detail) = clean_optional(self.detail) {
134 extra.insert(
135 "detail".to_string(),
136 serde_json::from_str(&detail).unwrap_or_else(|_| json!(detail)),
137 );
138 }
139 if let Some(audio) = clean_optional(self.audio) {
140 extra.insert("audio".to_string(), json!(audio));
141 }
142 if !extra.is_empty() {
143 entry.extra = serde_json::Value::Object(extra);
144 }
145
146 Ok(entry)
147 }
148}
149
/// Counts the physical lines in the file at `path`, excluding the first
/// (header) line.
///
/// A final line without a trailing newline is counted. An empty file and a
/// file containing only a header both yield `0`.
///
/// Lines are read as raw bytes into one reused buffer, so the count does not
/// depend on the file being valid UTF-8.
///
/// # Errors
///
/// Returns the underlying I/O error when the file cannot be opened or a read
/// fails. Read errors are propagated rather than counted as lines, so a
/// source that keeps failing cannot stall the loop.
fn count_lines(path: &Path) -> std::io::Result<usize> {
    let mut reader = BufReader::new(File::open(path)?);
    let mut line = Vec::new();
    let mut total = 0usize;

    // `read_until` returns 0 only at end of file.
    while reader.read_until(b'\n', &mut line)? != 0 {
        total += 1;
        line.clear();
    }

    // The first line is the header, not a data row.
    Ok(total.saturating_sub(1))
}
155
156fn clean_optional(value: Option<String>) -> Option<String> {
157 value
158 .map(clean_text)
159 .filter(|value| !value.trim().is_empty())
160}
161
/// Parses an optional numeric column into a `u32`.
///
/// Surrounding whitespace is ignored. A missing value, an empty value, the
/// literal `"0"`, and anything that does not parse as a `u32` all yield
/// `None`.
fn parse_u32(value: Option<&str>) -> Option<u32> {
    match value?.trim() {
        "" | "0" => None,
        digits => digits.parse().ok(),
    }
}
172
173fn split_lines(value: &str) -> Vec<String> {
174 value
175 .replace("\\n", "\n")
176 .lines()
177 .map(clean_text)
178 .filter(|line| !line.is_empty())
179 .collect()
180}
181
182fn split_pos(value: Option<&str>) -> Vec<String> {
183 let mut set = BTreeSet::new();
184 for pos in value.unwrap_or_default().split(['/', ',', ';', ' ']) {
185 let pos = clean_pos(pos);
186 if !pos.is_empty() {
187 set.insert(pos);
188 }
189 }
190 set.into_iter().collect()
191}
192
193fn pair_definitions(en: &str, zh: &str, fallback_pos: Option<String>) -> Vec<Definition> {
194 let en_lines = split_lines(en);
195 let zh_lines = split_lines(zh);
196 let len = en_lines.len().max(zh_lines.len()).max(1);
197
198 (0..len)
199 .filter_map(|idx| {
200 let en = en_lines.get(idx).cloned().unwrap_or_default();
201 let zh = zh_lines.get(idx).cloned().unwrap_or_default();
202 if en.is_empty() && zh.is_empty() {
203 None
204 } else {
205 Some(Definition::new(en, zh, fallback_pos.clone()))
206 }
207 })
208 .collect()
209}
210
211fn parse_exchanges(value: Option<&str>) -> Vec<Exchange> {
212 value
213 .unwrap_or_default()
214 .split('/')
215 .filter_map(|chunk| {
216 let (kind, word) = chunk.split_once(':')?;
217 let word = clean_text(word);
218 if word.is_empty() {
219 None
220 } else {
221 Some(Exchange {
222 kind: kind.trim().to_string(),
223 word,
224 })
225 }
226 })
227 .collect()
228}
229
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    /// Writes a header and a single `apple` row to a temporary CSV file and
    /// checks the fields of the entry that `EcdictParser::parse` produces.
    #[test]
    fn parses_ecdict_csv_row() {
        let fixture_lines = [
            "word,phonetic,definition,translation,pos,collins,oxford,tag,bnc,frq,exchange,detail,audio",
            "apple,ˈæpəl,\"a fruit\",\"苹果\",n,5,1,\"cet4 gk\",2764,3198,s:apples,,",
        ];

        let mut csv_file = tempfile::NamedTempFile::new().unwrap();
        for line in fixture_lines.iter() {
            writeln!(csv_file, "{}", line).unwrap();
        }

        let parsed: Result<Vec<_>> = EcdictParser.parse(csv_file.path()).unwrap().collect();
        let entries = parsed.unwrap();

        assert_eq!(entries.len(), 1);

        let apple = &entries[0];
        assert_eq!(apple.word, "apple");
        assert_eq!(apple.definitions[0].zh, "苹果");
        assert_eq!(apple.tags, vec!["cet4", "gk"]);
        assert_eq!(apple.freq_bnc, Some(2764));
    }
}