Skip to main content

dictx_parser/
ecdict.rs

1use crate::traits::{DictParser, ValidationReport};
2use dictx_core::{
3    clean_pos, clean_text, normalize_tag, Definition, DictEntry, DictSource, Exchange, Result,
4};
5use serde::Deserialize;
6use serde_json::json;
7use std::collections::BTreeSet;
8use std::fs::File;
9use std::io::{BufRead, BufReader};
10use std::path::Path;
11
12pub struct EcdictParser;
13
14impl DictParser for EcdictParser {
15    fn name(&self) -> &'static str {
16        "ECDICT CSV"
17    }
18
19    fn format_id(&self) -> &'static str {
20        "ecdict"
21    }
22
23    fn validate(&self, path: &Path) -> Result<ValidationReport> {
24        let file = File::open(path)?;
25        let mut reader = csv::ReaderBuilder::new()
26            .has_headers(true)
27            .flexible(true)
28            .from_reader(file);
29        let headers = reader.headers()?.clone();
30        let required = ["word", "translation"];
31        let missing: Vec<_> = required
32            .iter()
33            .filter(|name| !headers.iter().any(|h| h == **name))
34            .copied()
35            .collect();
36
37        if !missing.is_empty() {
38            return Ok(ValidationReport::invalid(
39                self.format_id(),
40                format!("缺少必要列: {}", missing.join(", ")),
41            ));
42        }
43
44        Ok(ValidationReport::ok(
45            self.format_id(),
46            count_lines(path).ok(),
47        ))
48    }
49
50    fn parse(&self, path: &Path) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
51        let file = File::open(path)?;
52        let reader = csv::ReaderBuilder::new()
53            .has_headers(true)
54            .flexible(true)
55            .trim(csv::Trim::All)
56            .from_reader(file);
57
58        let iter = reader
59            .into_deserialize::<EcdictRawRow>()
60            .filter_map(|row| match row {
61                Ok(row) if row.word.trim().is_empty() => None,
62                Ok(row) => Some(row.into_entry()),
63                Err(err) => Some(Err(err.into())),
64            });
65
66        Ok(Box::new(iter))
67    }
68}
69
70#[derive(Debug, Deserialize)]
71struct EcdictRawRow {
72    word: String,
73    #[serde(default)]
74    phonetic: Option<String>,
75    #[serde(default)]
76    definition: Option<String>,
77    #[serde(default)]
78    translation: Option<String>,
79    #[serde(default)]
80    pos: Option<String>,
81    #[serde(default)]
82    collins: Option<String>,
83    #[serde(default)]
84    oxford: Option<String>,
85    #[serde(default)]
86    tag: Option<String>,
87    #[serde(default)]
88    bnc: Option<String>,
89    #[serde(default)]
90    frq: Option<String>,
91    #[serde(default)]
92    exchange: Option<String>,
93    #[serde(default)]
94    detail: Option<String>,
95    #[serde(default)]
96    audio: Option<String>,
97}
98
99impl EcdictRawRow {
100    fn into_entry(self) -> Result<DictEntry> {
101        let mut entry = DictEntry::new(DictSource::Ecdict, clean_text(&self.word));
102        entry.phonetic_uk = clean_optional(self.phonetic);
103        entry.collins_star = self
104            .collins
105            .as_deref()
106            .and_then(|value| value.trim().parse::<u8>().ok())
107            .unwrap_or(0)
108            .min(5);
109        entry.oxford_3000 = self
110            .oxford
111            .as_deref()
112            .map(|value| matches!(value.trim(), "1" | "true" | "yes"))
113            .unwrap_or(false);
114        entry.freq_bnc = parse_u32(self.bnc.as_deref());
115        entry.freq_coca = parse_u32(self.frq.as_deref());
116        entry.pos = split_pos(self.pos.as_deref());
117        entry.tags = self
118            .tag
119            .as_deref()
120            .unwrap_or_default()
121            .split_whitespace()
122            .map(normalize_tag)
123            .filter(|tag| !tag.is_empty())
124            .collect();
125        entry.exchanges = parse_exchanges(self.exchange.as_deref());
126        entry.definitions = pair_definitions(
127            self.definition.as_deref().unwrap_or_default(),
128            self.translation.as_deref().unwrap_or_default(),
129            entry.pos.first().cloned(),
130        );
131
132        let mut extra = serde_json::Map::new();
133        if let Some(detail) = clean_optional(self.detail) {
134            extra.insert(
135                "detail".to_string(),
136                serde_json::from_str(&detail).unwrap_or_else(|_| json!(detail)),
137            );
138        }
139        if let Some(audio) = clean_optional(self.audio) {
140            extra.insert("audio".to_string(), json!(audio));
141        }
142        if !extra.is_empty() {
143            entry.extra = serde_json::Value::Object(extra);
144        }
145
146        Ok(entry)
147    }
148}
149
/// Estimates the number of data rows in the file at `path`.
///
/// Counts physical lines and subtracts one for the header line, saturating at
/// zero. A final line without a trailing newline still counts. Lines are
/// counted as raw bytes, so the content does not have to be valid UTF-8.
///
/// # Errors
///
/// Returns the underlying I/O error when the file cannot be opened or read.
/// Read errors are propagated rather than tallied as lines, so an unreadable
/// path (for example a directory) fails instead of looping on the same error.
fn count_lines(path: &Path) -> std::io::Result<usize> {
    let mut reader = BufReader::new(File::open(path)?);
    // One scratch buffer reused for every line avoids a fresh allocation per line.
    let mut scratch = Vec::new();
    let mut total = 0usize;
    // `read_until` reports 0 bytes only at end of input.
    while reader.read_until(b'\n', &mut scratch)? != 0 {
        total += 1;
        scratch.clear();
    }
    Ok(total.saturating_sub(1))
}
155
156fn clean_optional(value: Option<String>) -> Option<String> {
157    value
158        .map(clean_text)
159        .filter(|value| !value.trim().is_empty())
160}
161
/// Parses an optional numeric column into a non-zero `u32`.
///
/// Surrounding whitespace is ignored. Returns `None` when the value is absent,
/// blank, not a valid `u32`, or equal to zero. Zero is rejected after parsing,
/// so alternative spellings such as `00` or `+0` are treated the same as `0`.
fn parse_u32(value: Option<&str>) -> Option<u32> {
    value?
        .trim()
        .parse::<u32>()
        .ok()
        .filter(|&number| number != 0)
}
172
173fn split_lines(value: &str) -> Vec<String> {
174    value
175        .replace("\\n", "\n")
176        .lines()
177        .map(clean_text)
178        .filter(|line| !line.is_empty())
179        .collect()
180}
181
182fn split_pos(value: Option<&str>) -> Vec<String> {
183    let mut set = BTreeSet::new();
184    for pos in value.unwrap_or_default().split(['/', ',', ';', ' ']) {
185        let pos = clean_pos(pos);
186        if !pos.is_empty() {
187            set.insert(pos);
188        }
189    }
190    set.into_iter().collect()
191}
192
193fn pair_definitions(en: &str, zh: &str, fallback_pos: Option<String>) -> Vec<Definition> {
194    let en_lines = split_lines(en);
195    let zh_lines = split_lines(zh);
196    let len = en_lines.len().max(zh_lines.len()).max(1);
197
198    (0..len)
199        .filter_map(|idx| {
200            let en = en_lines.get(idx).cloned().unwrap_or_default();
201            let zh = zh_lines.get(idx).cloned().unwrap_or_default();
202            if en.is_empty() && zh.is_empty() {
203                None
204            } else {
205                Some(Definition::new(en, zh, fallback_pos.clone()))
206            }
207        })
208        .collect()
209}
210
211fn parse_exchanges(value: Option<&str>) -> Vec<Exchange> {
212    value
213        .unwrap_or_default()
214        .split('/')
215        .filter_map(|chunk| {
216            let (kind, word) = chunk.split_once(':')?;
217            let word = clean_text(word);
218            if word.is_empty() {
219                None
220            } else {
221                Some(Exchange {
222                    kind: kind.trim().to_string(),
223                    word,
224                })
225            }
226        })
227        .collect()
228}
229
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    /// Header line naming every column `EcdictRawRow` reads.
    const HEADER: &str =
        "word,phonetic,definition,translation,pos,collins,oxford,tag,bnc,frq,exchange,detail,audio";
    /// A single data row with most columns filled in.
    const APPLE_ROW: &str =
        "apple,ˈæpəl,\"a fruit\",\"苹果\",n,5,1,\"cet4 gk\",2764,3198,s:apples,,";

    /// Round-trips one row through a temporary CSV file and checks the parsed entry.
    #[test]
    fn parses_ecdict_csv_row() {
        let mut fixture = tempfile::NamedTempFile::new().unwrap();
        for line in [HEADER, APPLE_ROW] {
            writeln!(fixture, "{}", line).unwrap();
        }

        let parsed: Result<Vec<_>> = EcdictParser.parse(fixture.path()).unwrap().collect();
        let entries = parsed.unwrap();

        assert_eq!(entries.len(), 1);
        let apple = &entries[0];
        assert_eq!(apple.word, "apple");
        assert_eq!(apple.definitions[0].zh, "苹果");
        assert_eq!(apple.tags, vec!["cet4", "gk"]);
        assert_eq!(apple.freq_bnc, Some(2764));
    }
}