dictx-parser 0.1.1

Dictionary source parsers for DictX.
Documentation
use crate::traits::{DictParser, ValidationReport};
use dictx_core::{
    clean_pos, clean_text, normalize_tag, Definition, DictEntry, DictSource, Exchange, Result,
};
use serde::Deserialize;
use serde_json::json;
use std::collections::BTreeSet;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::Path;

/// Stateless parser for dictionaries in the "ECDICT CSV" format.
///
/// All behaviour lives in its [`DictParser`] implementation.
pub struct EcdictParser;

impl DictParser for EcdictParser {
    /// Human-readable name of the format handled by this parser.
    fn name(&self) -> &'static str {
        "ECDICT CSV"
    }

    /// Short identifier of the format; also handed to the `ValidationReport`
    /// constructors in [`validate`](Self::validate).
    fn format_id(&self) -> &'static str {
        "ecdict"
    }

    /// Checks that the CSV at `path` has the `word` and `translation` columns.
    ///
    /// Only the header row is inspected through the CSV reader. When a required
    /// column is absent an *invalid* report naming the missing columns is
    /// returned (as `Ok`, not as an error). Otherwise an *ok* report is returned
    /// together with a best-effort row estimate from `count_lines`; a failure
    /// to compute that estimate is downgraded to `None`.
    ///
    /// # Errors
    ///
    /// Returns an error when the file cannot be opened or the header row cannot
    /// be read.
    fn validate(&self, path: &Path) -> Result<ValidationReport> {
        let file = File::open(path)?;
        let mut reader = csv::ReaderBuilder::new()
            .has_headers(true)
            .flexible(true)
            // `parse` trims headers and fields, so trim headers here as well:
            // a header such as `word, translation` must not be rejected by
            // validation while being accepted by the actual parse.
            .trim(csv::Trim::Headers)
            .from_reader(file);
        let headers = reader.headers()?;
        let required = ["word", "translation"];
        let missing: Vec<_> = required
            .iter()
            .filter(|name| !headers.iter().any(|h| h == **name))
            .copied()
            .collect();

        if !missing.is_empty() {
            return Ok(ValidationReport::invalid(
                self.format_id(),
                format!("缺少必要列: {}", missing.join(", ")),
            ));
        }

        Ok(ValidationReport::ok(
            self.format_id(),
            count_lines(path).ok(),
        ))
    }

    /// Lazily parses the CSV at `path` into dictionary entries.
    ///
    /// Each record is deserialized into an `EcdictRawRow` and converted with
    /// `EcdictRawRow::into_entry`. Rows whose `word` is blank are skipped
    /// silently; rows that fail to deserialize are yielded as `Err` items so
    /// the caller decides whether to abort or continue.
    ///
    /// # Errors
    ///
    /// Returns an error immediately when the file cannot be opened.
    fn parse(&self, path: &Path) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
        let file = File::open(path)?;
        let reader = csv::ReaderBuilder::new()
            .has_headers(true)
            .flexible(true)
            .trim(csv::Trim::All)
            .from_reader(file);

        let iter = reader
            .into_deserialize::<EcdictRawRow>()
            .filter_map(|row| match row {
                // A row without a headword carries no usable entry.
                Ok(row) if row.word.trim().is_empty() => None,
                Ok(row) => Some(row.into_entry()),
                Err(err) => Some(Err(err.into())),
            });

        Ok(Box::new(iter))
    }
}

/// One CSV record as deserialized by `EcdictParser::parse`, before cleaning.
///
/// Every column except `word` is optional (`#[serde(default)]`), and all values
/// are kept as raw strings; interpretation happens in `into_entry`.
#[derive(Debug, Deserialize)]
struct EcdictRawRow {
    /// Headword; rows where this is blank are skipped by the parser.
    word: String,
    /// Phonetic transcription; stored as the entry's `phonetic_uk`.
    #[serde(default)]
    phonetic: Option<String>,
    /// Definition text passed as the `en` side to `pair_definitions`.
    #[serde(default)]
    definition: Option<String>,
    /// Translation text passed as the `zh` side to `pair_definitions`.
    #[serde(default)]
    translation: Option<String>,
    /// Part-of-speech text, split on `/`, `,`, `;` and spaces by `split_pos`.
    #[serde(default)]
    pos: Option<String>,
    /// Collins star rating; parsed as `u8` and capped at 5.
    #[serde(default)]
    collins: Option<String>,
    /// Oxford flag; `"1"`, `"true"` or `"yes"` count as set.
    #[serde(default)]
    oxford: Option<String>,
    /// Whitespace-separated tag list.
    #[serde(default)]
    tag: Option<String>,
    /// Numeric value stored as `freq_bnc`; empty or `"0"` means absent.
    #[serde(default)]
    bnc: Option<String>,
    /// Numeric value stored as `freq_coca`; empty or `"0"` means absent.
    #[serde(default)]
    frq: Option<String>,
    /// `/`-separated `kind:word` pairs, decoded by `parse_exchanges`.
    #[serde(default)]
    exchange: Option<String>,
    /// Free-form detail; kept under `extra["detail"]`, as JSON when it parses.
    #[serde(default)]
    detail: Option<String>,
    /// Audio reference; kept under `extra["audio"]` as a string.
    #[serde(default)]
    audio: Option<String>,
}

impl EcdictRawRow {
    /// Turns this raw CSV row into a cleaned [`DictEntry`] sourced from
    /// [`DictSource::Ecdict`].
    ///
    /// Malformed optional columns never fail the conversion: they fall back to
    /// their empty/zero/`None` form, so this currently always returns `Ok`.
    fn into_entry(self) -> Result<DictEntry> {
        let Self {
            word,
            phonetic,
            definition,
            translation,
            pos,
            collins,
            oxford,
            tag,
            bnc,
            frq,
            exchange,
            detail,
            audio,
        } = self;

        let mut entry = DictEntry::new(DictSource::Ecdict, clean_text(&word));
        entry.phonetic_uk = clean_optional(phonetic);

        // Unparsable or missing ratings count as 0; anything above 5 is capped.
        let stars = collins
            .as_deref()
            .and_then(|raw| raw.trim().parse::<u8>().ok())
            .unwrap_or(0);
        entry.collins_star = stars.min(5);

        entry.oxford_3000 = oxford
            .as_deref()
            .map_or(false, |flag| matches!(flag.trim(), "1" | "true" | "yes"));

        entry.freq_bnc = parse_u32(bnc.as_deref());
        entry.freq_coca = parse_u32(frq.as_deref());
        entry.pos = split_pos(pos.as_deref());

        let raw_tags = tag.as_deref().unwrap_or_default();
        entry.tags = raw_tags
            .split_whitespace()
            .map(normalize_tag)
            .filter(|normalized| !normalized.is_empty())
            .collect();

        entry.exchanges = parse_exchanges(exchange.as_deref());

        // The first part of speech (if any) is attached to every definition.
        let primary_pos = entry.pos.first().cloned();
        entry.definitions = pair_definitions(
            definition.as_deref().unwrap_or_default(),
            translation.as_deref().unwrap_or_default(),
            primary_pos,
        );

        let mut extra = serde_json::Map::new();
        if let Some(detail) = clean_optional(detail) {
            // Keep structured detail as JSON; otherwise store the raw text.
            let value = match serde_json::from_str::<serde_json::Value>(&detail) {
                Ok(parsed) => parsed,
                Err(_) => json!(detail),
            };
            extra.insert("detail".to_string(), value);
        }
        if let Some(audio) = clean_optional(audio) {
            extra.insert("audio".to_string(), json!(audio));
        }
        if !extra.is_empty() {
            entry.extra = serde_json::Value::Object(extra);
        }

        Ok(entry)
    }
}

/// Estimates the number of data rows in the file at `path`.
///
/// Counts newline-delimited physical lines and subtracts one for the header
/// row, saturating at zero for an empty file. This is an estimate: blank lines
/// and quoted fields containing line breaks are counted as lines too.
///
/// Lines are read as raw bytes into a reused buffer, so the content does not
/// have to be valid UTF-8 and no per-line allocation happens.
///
/// # Errors
///
/// Returns the first I/O error hit while opening or reading the file. Errors
/// are propagated rather than counted, so a persistently failing reader (for
/// example a directory opened as a file) cannot stall the loop.
fn count_lines(path: &Path) -> std::io::Result<usize> {
    let mut reader = BufReader::new(File::open(path)?);
    let mut line = Vec::new();
    let mut total = 0usize;
    while reader.read_until(b'\n', &mut line)? != 0 {
        total += 1;
        line.clear();
    }
    Ok(total.saturating_sub(1))
}

fn clean_optional(value: Option<String>) -> Option<String> {
    value
        .map(clean_text)
        .filter(|value| !value.trim().is_empty())
}

/// Parses an optional numeric column into a `u32`.
///
/// Surrounding whitespace is ignored. A missing value, an empty value, the
/// literal `"0"`, and anything that is not a valid `u32` all yield `None`.
fn parse_u32(value: Option<&str>) -> Option<u32> {
    match value?.trim() {
        "" | "0" => None,
        digits => digits.parse().ok(),
    }
}

fn split_lines(value: &str) -> Vec<String> {
    value
        .replace("\\n", "\n")
        .lines()
        .map(clean_text)
        .filter(|line| !line.is_empty())
        .collect()
}

/// Splits a part-of-speech column into a sorted, de-duplicated list.
///
/// Pieces are separated by `/`, `,`, `;` or a space, normalized with
/// `clean_pos`, and empty results are discarded. A missing column yields an
/// empty list.
fn split_pos(value: Option<&str>) -> Vec<String> {
    let unique: BTreeSet<_> = value
        .unwrap_or_default()
        .split(['/', ',', ';', ' '])
        .map(|piece| clean_pos(piece))
        .filter(|cleaned| !cleaned.is_empty())
        .collect();
    unique.into_iter().collect()
}

/// Pairs the lines of `en` and `zh` positionally into [`Definition`]s.
///
/// Both texts are broken up with `split_lines`; the n-th line of one is
/// matched with the n-th line of the other, and the shorter side is padded
/// with empty strings. Pairs where both sides are empty are dropped, so two
/// blank inputs give an empty list. Every definition receives a copy of
/// `fallback_pos`.
fn pair_definitions(en: &str, zh: &str, fallback_pos: Option<String>) -> Vec<Definition> {
    let mut en_iter = split_lines(en).into_iter();
    let mut zh_iter = split_lines(zh).into_iter();
    let mut paired = Vec::new();

    loop {
        // Advance both sides together until both are exhausted.
        let (en_line, zh_line) = match (en_iter.next(), zh_iter.next()) {
            (None, None) => break,
            (en_next, zh_next) => (en_next.unwrap_or_default(), zh_next.unwrap_or_default()),
        };
        if en_line.is_empty() && zh_line.is_empty() {
            continue;
        }
        paired.push(Definition::new(en_line, zh_line, fallback_pos.clone()));
    }

    paired
}

/// Decodes an exchange column of `/`-separated `kind:word` pairs.
///
/// Chunks without a `:` are ignored, as are chunks whose word is empty after
/// `clean_text`. The kind is only trimmed. A missing column yields an empty
/// list.
fn parse_exchanges(value: Option<&str>) -> Vec<Exchange> {
    let mut exchanges = Vec::new();
    for chunk in value.unwrap_or_default().split('/') {
        if let Some((kind, raw_word)) = chunk.split_once(':') {
            let word = clean_text(raw_word);
            if !word.is_empty() {
                exchanges.push(Exchange {
                    kind: kind.trim().to_string(),
                    word,
                });
            }
        }
    }
    exchanges
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    /// Header row listing every column the raw row type knows about.
    const HEADER: &str =
        "word,phonetic,definition,translation,pos,collins,oxford,tag,bnc,frq,exchange,detail,audio";
    /// A single fully populated record, with `detail` and `audio` left empty.
    const APPLE_ROW: &str =
        "apple,ˈæpəl,\"a fruit\",\"苹果\",n,5,1,\"cet4 gk\",2764,3198,s:apples,,";

    /// Round-trips one record through a temporary CSV file and checks the
    /// fields the parser is expected to populate.
    #[test]
    fn parses_ecdict_csv_row() {
        let mut fixture = tempfile::NamedTempFile::new().unwrap();
        for line in &[HEADER, APPLE_ROW] {
            writeln!(fixture, "{}", line).unwrap();
        }

        let parsed = EcdictParser
            .parse(fixture.path())
            .unwrap()
            .collect::<Result<Vec<_>>>()
            .unwrap();

        assert_eq!(parsed.len(), 1);
        let apple = &parsed[0];
        assert_eq!(apple.word, "apple");
        assert_eq!(apple.definitions[0].zh, "苹果");
        assert_eq!(apple.tags, vec!["cet4", "gk"]);
        assert_eq!(apple.freq_bnc, Some(2764));
    }
}