use crate::traits::{DictParser, ValidationReport};
use dictx_core::{
clean_pos, clean_text, normalize_tag, Definition, DictEntry, DictSource, Exchange, Result,
};
use serde::Deserialize;
use serde_json::json;
use std::collections::BTreeSet;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;
/// Parser for dictionaries distributed in the ECDICT CSV layout.
///
/// The type carries no state; all behaviour lives in its `DictParser`
/// implementation.
pub struct EcdictParser;
impl DictParser for EcdictParser {
fn name(&self) -> &'static str {
"ECDICT CSV"
}
fn format_id(&self) -> &'static str {
"ecdict"
}
fn validate(&self, path: &Path) -> Result<ValidationReport> {
let file = File::open(path)?;
let mut reader = csv::ReaderBuilder::new()
.has_headers(true)
.flexible(true)
.from_reader(file);
let headers = reader.headers()?.clone();
let required = ["word", "translation"];
let missing: Vec<_> = required
.iter()
.filter(|name| !headers.iter().any(|h| h == **name))
.copied()
.collect();
if !missing.is_empty() {
return Ok(ValidationReport::invalid(
self.format_id(),
format!("缺少必要列: {}", missing.join(", ")),
));
}
Ok(ValidationReport::ok(
self.format_id(),
count_lines(path).ok(),
))
}
fn parse(&self, path: &Path) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
let file = File::open(path)?;
let reader = csv::ReaderBuilder::new()
.has_headers(true)
.flexible(true)
.trim(csv::Trim::All)
.from_reader(file);
let iter = reader
.into_deserialize::<EcdictRawRow>()
.filter_map(|row| match row {
Ok(row) if row.word.trim().is_empty() => None,
Ok(row) => Some(row.into_entry()),
Err(err) => Some(Err(err.into())),
});
Ok(Box::new(iter))
}
}
/// One raw CSV record, deserialized by column name.
///
/// Every column except `word` is optional: `#[serde(default)]` lets a file
/// that lacks the column still deserialize, leaving the field as `None`.
#[derive(Debug, Deserialize)]
struct EcdictRawRow {
/// Headword; rows where this is blank are skipped by the parser.
word: String,
/// Phonetic transcription; stored on the entry as `phonetic_uk`.
#[serde(default)]
phonetic: Option<String>,
/// Definition text passed as the `en` side when pairing definitions.
#[serde(default)]
definition: Option<String>,
/// Translation text passed as the `zh` side when pairing definitions.
#[serde(default)]
translation: Option<String>,
/// Part-of-speech markers, split on `/`, `,`, `;` and spaces.
#[serde(default)]
pos: Option<String>,
/// Collins rating; parsed as `u8` and capped at 5, defaulting to 0.
#[serde(default)]
collins: Option<String>,
/// Oxford flag; `1`, `true` or `yes` count as set.
#[serde(default)]
oxford: Option<String>,
/// Whitespace-separated tags.
#[serde(default)]
tag: Option<String>,
/// Value stored as `freq_bnc`; blank, `0` or unparsable means absent.
#[serde(default)]
bnc: Option<String>,
/// Value stored as `freq_coca`; blank, `0` or unparsable means absent.
#[serde(default)]
frq: Option<String>,
/// `/`-separated `kind:word` pairs describing exchanges.
#[serde(default)]
exchange: Option<String>,
/// Extra detail; kept as JSON when it parses, otherwise as a plain string.
#[serde(default)]
detail: Option<String>,
/// Audio reference; stored under the `audio` key of the entry's extras.
#[serde(default)]
audio: Option<String>,
}
impl EcdictRawRow {
    /// Converts the raw CSV record into a `DictEntry` sourced from ECDICT.
    ///
    /// Values that are blank or fail to parse fall back to the entry's
    /// "absent" representation (`None`, `0`, `false` or an empty list); this
    /// conversion currently always returns `Ok`.
    fn into_entry(self) -> Result<DictEntry> {
        let Self {
            word,
            phonetic,
            definition,
            translation,
            pos,
            collins,
            oxford,
            tag,
            bnc,
            frq,
            exchange,
            detail,
            audio,
        } = self;

        let mut entry = DictEntry::new(DictSource::Ecdict, clean_text(&word));
        entry.phonetic_uk = clean_optional(phonetic);

        // Unparsable or missing ratings count as zero; anything above 5 is capped.
        let stars = collins
            .as_deref()
            .and_then(|raw| raw.trim().parse::<u8>().ok())
            .unwrap_or(0);
        entry.collins_star = stars.min(5);

        entry.oxford_3000 = matches!(
            oxford.as_deref().map(str::trim),
            Some("1" | "true" | "yes")
        );

        entry.freq_bnc = parse_u32(bnc.as_deref());
        entry.freq_coca = parse_u32(frq.as_deref());
        entry.pos = split_pos(pos.as_deref());

        let raw_tags = tag.as_deref().unwrap_or_default();
        entry.tags = raw_tags
            .split_whitespace()
            .map(normalize_tag)
            .filter(|tag| !tag.is_empty())
            .collect();

        entry.exchanges = parse_exchanges(exchange.as_deref());

        // Every definition inherits the first part of speech, so `entry.pos`
        // must already be populated at this point.
        let primary_pos = entry.pos.first().cloned();
        entry.definitions = pair_definitions(
            definition.as_deref().unwrap_or_default(),
            translation.as_deref().unwrap_or_default(),
            primary_pos,
        );

        let mut extra = serde_json::Map::new();
        if let Some(detail) = clean_optional(detail) {
            // Keep structured detail as JSON; otherwise store the raw text.
            let value = match serde_json::from_str(&detail) {
                Ok(parsed) => parsed,
                Err(_) => json!(detail),
            };
            extra.insert("detail".to_string(), value);
        }
        if let Some(audio) = clean_optional(audio) {
            extra.insert("audio".to_string(), json!(audio));
        }
        if !extra.is_empty() {
            entry.extra = serde_json::Value::Object(extra);
        }

        Ok(entry)
    }
}
/// Counts the lines of the file at `path`, not counting the first (header)
/// line.
///
/// Lines are delimited by `\n`; a final line without a trailing newline still
/// counts. The content is never decoded, so bytes that are not valid UTF-8 do
/// not affect the result. An empty file yields 0.
///
/// # Errors
///
/// Returns the underlying I/O error when the file cannot be opened or read.
fn count_lines(path: &Path) -> std::io::Result<usize> {
    let mut reader = BufReader::new(File::open(path)?);
    // One byte buffer is reused for every line instead of allocating a
    // `String` per line just to throw it away.
    let mut line = Vec::new();
    let mut total = 0usize;
    // `read_until` reports 0 bytes only at end of input. Propagating its
    // error stops the loop on a failing reader instead of counting the
    // failure as a line and retrying.
    while reader.read_until(b'\n', &mut line)? != 0 {
        total += 1;
        line.clear();
    }
    Ok(total.saturating_sub(1))
}
fn clean_optional(value: Option<String>) -> Option<String> {
value
.map(clean_text)
.filter(|value| !value.trim().is_empty())
}
/// Parses an optional numeric column into a `u32`.
///
/// Surrounding whitespace is ignored. A missing value, a blank value, the
/// literal `0`, and anything that does not parse as `u32` all yield `None`.
fn parse_u32(value: Option<&str>) -> Option<u32> {
    let text = value?.trim();
    match text {
        // A blank cell and a literal zero both mean "no data".
        "" | "0" => None,
        digits => digits.parse().ok(),
    }
}
fn split_lines(value: &str) -> Vec<String> {
value
.replace("\\n", "\n")
.lines()
.map(clean_text)
.filter(|line| !line.is_empty())
.collect()
}
/// Splits a part-of-speech cell into its distinct, cleaned markers.
///
/// The cell is split on `/`, `,`, `;` and spaces. Markers that are empty
/// after cleaning are dropped, and the result is deduplicated and returned
/// in sorted order.
fn split_pos(value: Option<&str>) -> Vec<String> {
    let raw = value.unwrap_or_default();
    // Collecting into a `BTreeSet` both deduplicates and sorts the markers.
    let unique: BTreeSet<_> = raw
        .split(['/', ',', ';', ' '])
        .map(|part| clean_pos(part))
        .filter(|pos| !pos.is_empty())
        .collect();
    unique.into_iter().collect()
}
/// Pairs the lines of the `en` and `zh` texts position by position.
///
/// Both texts are split into lines and walked in lockstep; when one side has
/// fewer lines, the missing side of a pair is an empty string. Every
/// resulting definition receives a copy of `fallback_pos`. Returns an empty
/// list when neither text has any line.
fn pair_definitions(en: &str, zh: &str, fallback_pos: Option<String>) -> Vec<Definition> {
    let mut en_lines = split_lines(en).into_iter();
    let mut zh_lines = split_lines(zh).into_iter();
    let mut paired = Vec::new();
    loop {
        let (en_line, zh_line) = match (en_lines.next(), zh_lines.next()) {
            // Both sides exhausted: nothing left to pair.
            (None, None) => break,
            (en_line, zh_line) => (en_line.unwrap_or_default(), zh_line.unwrap_or_default()),
        };
        // A pair with no text on either side carries no information.
        if en_line.is_empty() && zh_line.is_empty() {
            continue;
        }
        paired.push(Definition::new(en_line, zh_line, fallback_pos.clone()));
    }
    paired
}
/// Parses an exchange cell of `/`-separated `kind:word` pairs.
///
/// Chunks without a `:` are ignored, as are chunks whose word is empty after
/// cleaning. The kind is kept as written apart from trimming surrounding
/// whitespace. A missing cell yields an empty list.
fn parse_exchanges(value: Option<&str>) -> Vec<Exchange> {
    let mut exchanges = Vec::new();
    for chunk in value.unwrap_or_default().split('/') {
        // Only the first `:` separates the kind from the word.
        if let Some((kind, form)) = chunk.split_once(':') {
            let word = clean_text(form);
            if !word.is_empty() {
                exchanges.push(Exchange {
                    kind: kind.trim().to_string(),
                    word,
                });
            }
        }
    }
    exchanges
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    /// Header row naming every column used by the fixture.
    const HEADER: &str =
        "word,phonetic,definition,translation,pos,collins,oxford,tag,bnc,frq,exchange,detail,audio";

    /// A single fully populated data row (detail and audio left blank).
    const APPLE_ROW: &str =
        "apple,ˈæpəl,\"a fruit\",\"苹果\",n,5,1,\"cet4 gk\",2764,3198,s:apples,,";

    /// Parses a two-line CSV fixture and checks the key fields of the single
    /// resulting entry.
    #[test]
    fn parses_ecdict_csv_row() {
        let mut file = tempfile::NamedTempFile::new().unwrap();
        writeln!(file, "{}", HEADER).unwrap();
        writeln!(file, "{}", APPLE_ROW).unwrap();

        let parsed: Result<Vec<_>> = EcdictParser.parse(file.path()).unwrap().collect();
        let entries = parsed.unwrap();

        assert_eq!(entries.len(), 1);
        let apple = &entries[0];
        assert_eq!(apple.word, "apple");
        assert_eq!(apple.definitions[0].zh, "苹果");
        assert_eq!(apple.tags, vec!["cet4", "gk"]);
        assert_eq!(apple.freq_bnc, Some(2764));
    }
}