dictx-parser 0.1.1

Dictionary source parsers for DictX.
Documentation
use crate::html::plain_text_from_html;
use crate::traits::{DictParser, ValidationReport};
use dictx_core::{clean_text, Definition, DictEntry, DictSource, Example, Phrase, Result};
use regex::Regex;
use serde_json::json;
use std::path::Path;
use std::sync::OnceLock;

/// Stateless parser for MDict MDX dictionaries; all behaviour lives in its
/// [`DictParser`] implementation.
pub struct MdxParser;

impl DictParser for MdxParser {
    /// Human-readable name of the dictionary format handled by this parser.
    fn name(&self) -> &'static str {
        "MDict MDX dictionary"
    }

    /// Short machine identifier of the format.
    fn format_id(&self) -> &'static str {
        "mdx"
    }

    /// Loads the file at `path` fully into memory, parses it as MDX and reports
    /// the number of keys found.
    ///
    /// # Errors
    /// Propagates the I/O error from reading the file and any error returned by
    /// `parse_mdx_bytes`.
    fn validate(&self, path: &Path) -> Result<ValidationReport> {
        let raw = std::fs::read(path)?;
        let mdx = parse_mdx_bytes(&raw)?;
        let key_count = mdx.keys().count();
        Ok(ValidationReport::ok(self.format_id(), Some(key_count)))
    }

    /// Parses the MDX file at `path` and returns an iterator over its usable entries.
    ///
    /// The whole file is read and converted eagerly; records for which
    /// `entry_from_mdx_record` returns `None` are skipped. The file stem is used
    /// as the source filename of every entry, falling back to `"mdx"` when the
    /// stem is missing or not valid UTF-8.
    ///
    /// # Errors
    /// Propagates the I/O error from reading the file and any error returned by
    /// `parse_mdx_bytes`.
    fn parse(&self, path: &Path) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
        let raw = std::fs::read(path)?;
        let mdx = parse_mdx_bytes(&raw)?;

        let stem = match path.file_stem().and_then(|stem| stem.to_str()) {
            Some(stem) => stem.to_string(),
            None => "mdx".to_string(),
        };

        let mut parsed: Vec<Result<DictEntry>> = Vec::new();
        for record in mdx.items() {
            if let Some(entry) = entry_from_mdx_record(&stem, record.key, &record.definition) {
                parsed.push(Ok(entry));
            }
        }
        Ok(Box::new(parsed.into_iter()))
    }
}

/// Parses raw MDX file contents with the `mdict_parser` crate.
///
/// The call is wrapped in `catch_unwind` so that a panic raised while parsing
/// is turned into an `InvalidData` error instead of unwinding into the caller.
///
/// # Errors
/// Returns `DictxError::InvalidData` (message: "MDX parsing failed", in Chinese)
/// when the underlying parser panics.
fn parse_mdx_bytes(bytes: &[u8]) -> Result<mdict_parser::mdict::Mdx> {
    match std::panic::catch_unwind(|| mdict_parser::parser::parse(bytes)) {
        Ok(mdx) => Ok(mdx),
        // The panic payload is intentionally discarded; only a fixed message is reported.
        Err(_payload) => Err(dictx_core::DictxError::InvalidData(
            "MDX 解析失败".to_string(),
        )),
    }
}

/// Converts one raw MDX record (`key` + HTML body) into a [`DictEntry`].
///
/// Returns `None` when the record carries nothing usable:
/// * the cleaned key is empty or equals `freemdict` (ASCII case-insensitive),
/// * the body, after stripping NUL characters and whitespace, is empty or is a
///   `@@@LINK=` redirect record,
/// * the body yields no plain text.
///
/// Otherwise the entry is tagged `"mdx"` and filled with:
/// * definitions — up to 8 distinct numbered definitions extracted from the
///   HTML, or the whole plain text when none are found. For a headword that
///   contains CJK characters the gloss is passed as the first argument of
///   `Definition::new` and the headword as the second; for any other headword
///   the first argument is empty and the gloss is passed as the second.
/// * examples and related phrases extracted from the HTML,
/// * an `extra` JSON object with the format, source file and a 240-character
///   plain-text preview.
fn entry_from_mdx_record(filename: &str, key: &str, html: &str) -> Option<DictEntry> {
    let word = clean_text(key);
    if word.is_empty() || word.eq_ignore_ascii_case("freemdict") {
        return None;
    }

    let html = html.trim_matches('\0').trim();
    if html.is_empty() || html.starts_with("@@@LINK=") {
        return None;
    }

    let plain = plain_text_from_html(html);
    if plain.is_empty() {
        return None;
    }

    let pos = extract_pos(&plain);
    let plain_preview: String = plain.chars().take(240).collect();
    let word_is_cjk = contains_cjk(&word);

    // Keep the first occurrence of each gloss, in document order, up to 8.
    // `Vec::dedup` would only drop *adjacent* repeats, so the same gloss listed
    // under non-neighbouring senses would survive; compare against everything
    // kept so far instead (at most 8 items, so the linear scan is cheap).
    let mut glosses: Vec<String> = Vec::new();
    for gloss in extract_numbered_definitions(html) {
        if !glosses.contains(&gloss) {
            glosses.push(gloss);
            if glosses.len() == 8 {
                break;
            }
        }
    }
    // No numbered definitions: fall back to the record's whole plain text.
    if glosses.is_empty() {
        glosses.push(plain);
    }

    let mut entry = DictEntry::new(
        DictSource::Mdx {
            filename: filename.to_string(),
        },
        word.clone(),
    );
    entry.tags.push("mdx".to_string());

    for gloss in glosses {
        let definition = if word_is_cjk {
            Definition::new(gloss, word.clone(), pos.clone())
        } else {
            Definition::new("", gloss, pos.clone())
        };
        entry.definitions.push(definition);
    }

    entry.examples = extract_examples(html);
    entry.phrases = extract_related_phrases(html);
    entry.extra = json!({
        "format": "mdx",
        "source_file": filename,
        "plain_preview": plain_preview,
    });

    // `glosses` is never empty at this point, so the entry always has at least
    // one definition and is always worth returning.
    Some(entry)
}

/// Collects the plain text of every numbered definition found in `html`.
///
/// A numbered definition is whatever `numbered_definition_re` captures as
/// `body`. Texts that are empty after HTML stripping, or that contain CJK
/// characters, are left out. Document order is preserved.
fn extract_numbered_definitions(html: &str) -> Vec<String> {
    numbered_definition_re()
        .captures_iter(html)
        .filter_map(|caps| caps.name("body"))
        .map(|body| plain_text_from_html(body.as_str()))
        .filter(|text| !text.is_empty() && !contains_cjk(text))
        .collect()
}

/// Extracts up to 8 bilingual example sentences from the `info-cite` blocks of `html`.
///
/// Only the first two non-empty paragraphs of each block are considered. A
/// block is kept when exactly one of those two paragraphs contains CJK
/// characters: that paragraph becomes `zh`, the other one `en`.
fn extract_examples(html: &str) -> Vec<Example> {
    info_cite_re()
        .captures_iter(html)
        .filter_map(|caps| {
            let body = caps.name("body")?;
            let mut paragraphs = paragraph_texts(body.as_str()).into_iter();
            let first = paragraphs.next()?;
            let second = paragraphs.next()?;

            // Decide which side is Chinese; ambiguous pairs are discarded.
            let (zh, en) = match (contains_cjk(&first), contains_cjk(&second)) {
                (true, false) => (first, second),
                (false, true) => (second, first),
                _ => return None,
            };
            (!en.is_empty() && !zh.is_empty()).then_some(Example { en, zh })
        })
        .take(8)
        .collect()
}

/// Extracts up to 8 related phrases from the paragraphs matched by `related_phrase_re`.
///
/// Each matched paragraph is reduced to plain text and handed to
/// `split_related_phrase`; paragraphs it cannot split are skipped.
fn extract_related_phrases(html: &str) -> Vec<Phrase> {
    related_phrase_re()
        .captures_iter(html)
        .filter_map(|caps| {
            let raw = caps.name("body")?;
            let text = plain_text_from_html(raw.as_str());
            split_related_phrase(&text)
        })
        .map(|(zh, en)| Phrase { en, zh })
        .take(8)
        .collect()
}

/// Returns the plain text of every `<p>…</p>` element in `html`, in document
/// order, leaving out paragraphs whose text is empty.
fn paragraph_texts(html: &str) -> Vec<String> {
    let mut texts = Vec::new();
    for caps in paragraph_re().captures_iter(html) {
        let Some(inner) = caps.name("body") else {
            continue;
        };
        let text = plain_text_from_html(inner.as_str());
        if !text.is_empty() {
            texts.push(text);
        }
    }
    texts
}

/// Splits a related-phrase line such as `苹果汁[名] apple cider/extract` into
/// `(zh, en)`.
///
/// The text is cut at the first `]`: everything before the first `[` of the
/// left part is the Chinese phrase, everything after the `]` is the English
/// gloss. Returns `None` when there is no `]` or when either side is empty
/// after cleaning.
fn split_related_phrase(text: &str) -> Option<(String, String)> {
    let cleaned = clean_text(text);
    let (head, tail) = cleaned.split_once(']')?;

    // Drop the bracketed label (e.g. the part of speech) that trails the phrase.
    let zh = clean_text(head.split('[').next()?);
    if zh.is_empty() {
        return None;
    }

    let en = clean_text(tail);
    (!en.is_empty()).then_some((zh, en))
}

/// Returns the trimmed content of the first `[...]` group in `plain`, which is
/// used as the entry's part-of-speech label, or `None` if there is no such
/// group or it is blank.
fn extract_pos(plain: &str) -> Option<String> {
    let caps = pos_re().captures(plain)?;
    let label = caps.get(1)?.as_str().trim();
    if label.is_empty() {
        None
    } else {
        Some(label.to_string())
    }
}

/// Reports whether `value` contains at least one Han (CJK) ideograph.
///
/// Besides the basic CJK Unified Ideographs block (U+4E00–U+9FFF) this also
/// recognises Extension A, the compatibility ideographs and the
/// supplementary-plane extensions, so that rare characters are not mistaken
/// for non-Chinese text. Kana, Hangul and CJK punctuation do not count.
fn contains_cjk(value: &str) -> bool {
    value.chars().any(|ch| {
        matches!(
            ch,
            '\u{3400}'..='\u{4dbf}'          // CJK Unified Ideographs Extension A
                | '\u{4e00}'..='\u{9fff}'    // CJK Unified Ideographs
                | '\u{f900}'..='\u{faff}'    // CJK Compatibility Ideographs
                | '\u{20000}'..='\u{2fa1f}'  // Extensions B and later + Compatibility Supplement
                | '\u{30000}'..='\u{323af}'  // Extensions G and H
        )
    })
}

/// Lazily compiled regex matching a numbered list item: an `<li>` that starts
/// with `<i class="number">N</i>` followed by a `<p>`, whose inner HTML is
/// captured as `body`. Matching is case-insensitive and `.` spans newlines.
fn numbered_definition_re() -> &'static Regex {
    const PATTERN: &str = r#"(?is)<li[^>]*>\s*<i[^>]*class=["']number["'][^>]*>\s*\d+\s*</i>\s*<p[^>]*>(?P<body>.*?)</p>"#;
    static RE: OnceLock<Regex> = OnceLock::new();
    // The pattern is a compile-time constant, so compilation cannot fail at runtime.
    RE.get_or_init(|| Regex::new(PATTERN).unwrap())
}

/// Lazily compiled regex matching a `<div>` whose `class` attribute contains
/// `info-cite`; the content up to the first closing `</div>` is captured as
/// `body`. Matching is case-insensitive and `.` spans newlines.
fn info_cite_re() -> &'static Regex {
    const PATTERN: &str =
        r#"(?is)<div[^>]*class=["'][^"']*info-cite[^"']*["'][^>]*>(?P<body>.*?)</div>"#;
    static RE: OnceLock<Regex> = OnceLock::new();
    // The pattern is a compile-time constant, so compilation cannot fail at runtime.
    RE.get_or_init(|| Regex::new(PATTERN).unwrap())
}

/// Lazily compiled regex matching any `<p>…</p>` element and capturing its
/// inner HTML as `body`. Matching is case-insensitive and `.` spans newlines.
fn paragraph_re() -> &'static Regex {
    const PATTERN: &str = r#"(?is)<p[^>]*>(?P<body>.*?)</p>"#;
    static RE: OnceLock<Regex> = OnceLock::new();
    // The pattern is a compile-time constant, so compilation cannot fail at runtime.
    RE.get_or_init(|| Regex::new(PATTERN).unwrap())
}

/// Lazily compiled regex matching a `<p>` whose `class` attribute contains
/// `gray`, capturing its inner HTML as `body`. Matching is case-insensitive
/// and `.` spans newlines.
fn related_phrase_re() -> &'static Regex {
    const PATTERN: &str =
        r#"(?is)<p[^>]*class=["'][^"']*gray[^"']*["'][^>]*>(?P<body>.*?)</p>"#;
    static RE: OnceLock<Regex> = OnceLock::new();
    // The pattern is a compile-time constant, so compilation cannot fail at runtime.
    RE.get_or_init(|| Regex::new(PATTERN).unwrap())
}

/// Lazily compiled regex matching a non-empty square-bracketed group and
/// capturing the text between the brackets as group 1.
fn pos_re() -> &'static Regex {
    const PATTERN: &str = r"\[([^]]+)\]";
    static RE: OnceLock<Regex> = OnceLock::new();
    // The pattern is a compile-time constant, so compilation cannot fail at runtime.
    RE.get_or_init(|| Regex::new(PATTERN).unwrap())
}

// Unit tests for the record-to-entry conversion, driven by hand-written HTML fixtures.
#[cfg(test)]
mod tests {
    use super::*;

    // A Chinese headword with one numbered definition, one bilingual example
    // block and one related-phrase paragraph must populate all three parts of
    // the entry.
    #[test]
    fn extracts_chinese_entry_from_mdx_html() {
        let html = r#"
            <span class="entry_head">苹果</span>
            <h5><span class="bold">[名] </span></h5>
            <ol class="info-list">
              <li><i class="number">1</i><p>apple (the tree and its fruit)</p>
                <div class="info-cite">
                  <p><em>苹果</em>广泛种植于温带地区。</p>
                  <p><span class="italic">The apple is widely grown in temperate regions.</span></p>
                </div>
              </li>
            </ol>
            <p class="gray"><span class="bold">苹果汁</span>[名] apple cider/extract</p>
        "#;

        let entry = entry_from_mdx_record("新世纪汉英大词典", "苹果", html).unwrap();

        assert_eq!(entry.word, "苹果");
        // For a CJK headword the gloss lands in `en` and the headword in `zh`.
        assert_eq!(entry.definitions[0].en, "apple (the tree and its fruit)");
        assert_eq!(entry.definitions[0].zh, "苹果");
        assert_eq!(
            entry.examples[0].en,
            "The apple is widely grown in temperate regions."
        );
        // The bracketed label between the phrase and its gloss is dropped.
        assert_eq!(entry.phrases[0].zh, "苹果汁");
        assert_eq!(entry.phrases[0].en, "apple cider/extract");
    }

    // Records whose body starts with `@@@LINK=` produce no entry.
    #[test]
    fn skips_mdx_link_records() {
        assert!(entry_from_mdx_record("dict", "苹果", "@@@LINK=apple").is_none());
    }
}