use crate::html::plain_text_from_html;
use crate::traits::{DictParser, ValidationReport};
use dictx_core::{clean_text, Definition, DictEntry, DictSource, Example, Phrase, Result};
use regex::Regex;
use serde_json::json;
use std::path::Path;
use std::sync::OnceLock;
/// Stateless parser for MDict `.mdx` dictionary files.
///
/// All behaviour lives in its [`DictParser`] implementation.
pub struct MdxParser;
/// [`DictParser`] implementation that decodes MDX files through `parse_mdx_bytes`.
impl DictParser for MdxParser {
    /// Human-readable name of the supported format.
    fn name(&self) -> &'static str {
        "MDict MDX dictionary"
    }

    /// Short identifier of the supported format.
    fn format_id(&self) -> &'static str {
        "mdx"
    }

    /// Reads the whole file, decodes it and reports the number of keys it holds.
    ///
    /// # Errors
    /// Fails when the file cannot be read or when decoding it as MDX fails.
    fn validate(&self, path: &Path) -> Result<ValidationReport> {
        let raw = std::fs::read(path)?;
        let mdx = parse_mdx_bytes(&raw)?;
        let key_count = mdx.keys().count();
        Ok(ValidationReport::ok(self.format_id(), Some(key_count)))
    }

    /// Reads the whole file and converts every usable record into a [`DictEntry`].
    ///
    /// Records rejected by `entry_from_mdx_record` are silently skipped. All
    /// entries are materialised into a `Vec` before the iterator is returned.
    ///
    /// # Errors
    /// Fails when the file cannot be read or when decoding it as MDX fails.
    fn parse(&self, path: &Path) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
        let raw = std::fs::read(path)?;
        let mdx = parse_mdx_bytes(&raw)?;

        // The file stem labels the source; fall back to "mdx" when the path has
        // no stem or the stem is not valid Unicode.
        let stem = match path.file_stem().and_then(|stem| stem.to_str()) {
            Some(stem) => stem.to_string(),
            None => "mdx".to_string(),
        };

        let mut entries: Vec<Result<DictEntry>> = Vec::new();
        for record in mdx.items() {
            if let Some(entry) = entry_from_mdx_record(&stem, record.key, &record.definition) {
                entries.push(Ok(entry));
            }
        }
        Ok(Box::new(entries.into_iter()))
    }
}
/// Decodes an in-memory MDX file.
///
/// The call into `mdict_parser` is wrapped in `catch_unwind`, so a panic raised
/// while decoding is reported as `DictxError::InvalidData` instead of unwinding
/// into the caller.
fn parse_mdx_bytes(bytes: &[u8]) -> Result<mdict_parser::mdict::Mdx> {
    match std::panic::catch_unwind(|| mdict_parser::parser::parse(bytes)) {
        Ok(dict) => Ok(dict),
        // The panic payload carries nothing we surface to the user; drop it.
        Err(_) => Err(dictx_core::DictxError::InvalidData(
            "MDX 解析失败".to_string(),
        )),
    }
}
/// Converts one raw MDX record (headword plus HTML body) into a [`DictEntry`].
///
/// * `filename` – label stored in the entry's source and in `extra.source_file`.
/// * `key` – raw headword of the record.
/// * `html` – raw HTML body of the record.
///
/// Returns `None` when the record carries nothing usable: an empty headword, the
/// `freemdict` placeholder key, an empty body, an `@@@LINK=` redirect, or a body
/// whose plain-text rendering is empty.
///
/// Numbered senses found in the HTML become the definitions (at most eight,
/// duplicates removed). When there are none, the plain text of the whole record
/// is used as the single definition.
fn entry_from_mdx_record(filename: &str, key: &str, html: &str) -> Option<DictEntry> {
    let word = clean_text(key);
    if word.is_empty() || word.eq_ignore_ascii_case("freemdict") {
        return None;
    }

    // Strip NUL padding and surrounding whitespace from the record body.
    let html = html.trim_matches('\0').trim();
    if html.is_empty() || html.starts_with("@@@LINK=") {
        return None;
    }

    let plain = plain_text_from_html(html);
    if plain.is_empty() {
        return None;
    }

    let mut entry = DictEntry::new(
        DictSource::Mdx {
            filename: filename.to_string(),
        },
        word.clone(),
    );
    entry.tags.push("mdx".to_string());

    let pos = extract_pos(&plain);
    // Taken before `plain` is possibly moved into the sense list below.
    let plain_preview: String = plain.chars().take(240).collect();

    // Keep the first eight distinct senses in document order. `Vec::dedup` would
    // only collapse *adjacent* repeats, so a sense repeated later in the record
    // would survive and use up one of the eight slots.
    let mut senses: Vec<String> = Vec::with_capacity(8);
    for sense in extract_numbered_definitions(html) {
        if senses.len() == 8 {
            break;
        }
        if !senses.contains(&sense) {
            senses.push(sense);
        }
    }
    // No numbered senses: fall back to the plain text of the whole record.
    if senses.is_empty() {
        senses.push(plain);
    }

    // For a CJK headword the sense text is passed first and the headword second;
    // otherwise the first argument is left empty and the sense text goes second.
    let word_is_cjk = contains_cjk(&word);
    for sense in senses {
        let definition = if word_is_cjk {
            Definition::new(sense, word.clone(), pos.clone())
        } else {
            Definition::new("", sense, pos.clone())
        };
        entry.definitions.push(definition);
    }

    entry.examples = extract_examples(html);
    entry.phrases = extract_related_phrases(html);
    entry.extra = json!({
        "format": "mdx",
        "source_file": filename,
        "plain_preview": plain_preview,
    });

    // Defensive guard: at least one definition was pushed above, so this only
    // rejects an entry if that invariant is ever broken.
    if entry.definitions.is_empty() && entry.examples.is_empty() && entry.phrases.is_empty() {
        None
    } else {
        Some(entry)
    }
}
/// Collects the plain text of every numbered sense in `html`.
///
/// A sense is kept only when its text is non-empty and contains no CJK
/// ideograph; everything else is dropped. Order follows the document.
fn extract_numbered_definitions(html: &str) -> Vec<String> {
    numbered_definition_re()
        .captures_iter(html)
        .filter_map(|caps| caps.name("body"))
        .map(|body| plain_text_from_html(body.as_str()))
        .filter(|text| !(text.is_empty() || contains_cjk(text)))
        .collect()
}
/// Extracts up to eight bilingual example sentences from `info-cite` blocks.
///
/// Only the first two non-empty paragraphs of each block are considered, and a
/// block is used only when exactly one of them contains CJK text: that one
/// becomes `zh`, the other `en`. Blocks that do not fit this shape are skipped.
fn extract_examples(html: &str) -> Vec<Example> {
    info_cite_re()
        .captures_iter(html)
        .filter_map(|caps| {
            let block = caps.name("body")?;
            let mut paragraphs = paragraph_texts(block.as_str()).into_iter();
            let first = paragraphs.next()?;
            let second = paragraphs.next()?;

            let (zh, en) = match (contains_cjk(&first), contains_cjk(&second)) {
                (true, false) => (first, second),
                (false, true) => (second, first),
                // Both or neither side is Chinese: the pairing is ambiguous.
                _ => return None,
            };

            if en.is_empty() || zh.is_empty() {
                return None;
            }
            Some(Example { en, zh })
        })
        .take(8)
        .collect()
}
/// Extracts up to eight related phrases from the `gray` paragraphs of `html`.
///
/// Each paragraph is flattened to plain text and split by
/// `split_related_phrase`; paragraphs that cannot be split are skipped.
fn extract_related_phrases(html: &str) -> Vec<Phrase> {
    related_phrase_re()
        .captures_iter(html)
        .filter_map(|caps| {
            let raw = caps.name("body")?;
            let text = plain_text_from_html(raw.as_str());
            let (zh, en) = split_related_phrase(&text)?;
            Some(Phrase { en, zh })
        })
        .take(8)
        .collect()
}
/// Returns the plain text of every `<p>` element in `html`, in document order,
/// leaving out paragraphs whose text is empty.
fn paragraph_texts(html: &str) -> Vec<String> {
    let mut texts = Vec::new();
    for caps in paragraph_re().captures_iter(html) {
        if let Some(body) = caps.name("body") {
            let text = plain_text_from_html(body.as_str());
            if !text.is_empty() {
                texts.push(text);
            }
        }
    }
    texts
}
/// Splits a related-phrase line such as `苹果汁[名] apple cider/extract` into
/// `(zh, en)`.
///
/// The text before the first `[` is the Chinese phrase and the text after the
/// first `]` is the English gloss. Returns `None` when there is no `]`, or when
/// either part is empty after cleaning.
fn split_related_phrase(text: &str) -> Option<(String, String)> {
    let normalized = clean_text(text);
    let (head, tail) = normalized.split_once(']')?;

    // Drop the bracketed tag that was opened before the `]` we split on.
    let phrase = match head.find('[') {
        Some(open) => &head[..open],
        None => head,
    };
    let zh = clean_text(phrase);
    if zh.is_empty() {
        return None;
    }

    let en = clean_text(tail);
    if en.is_empty() {
        return None;
    }
    Some((zh, en))
}
/// Returns the trimmed content of the first `[...]` tag found in `plain`, or
/// `None` when there is no such tag or its content is only whitespace.
fn extract_pos(plain: &str) -> Option<String> {
    let captures = pos_re().captures(plain)?;
    let label = captures.get(1)?.as_str().trim();
    if label.is_empty() {
        None
    } else {
        Some(label.to_string())
    }
}
/// Reports whether `value` contains at least one Han (CJK) ideograph.
///
/// Besides the basic CJK Unified Ideographs block this also recognises
/// Extension A, the compatibility ideographs and the supplementary-plane
/// extensions, so rare characters are classified the same way as common ones.
/// Kana, Hangul and CJK punctuation are not counted.
fn contains_cjk(value: &str) -> bool {
    value.chars().any(|ch| {
        matches!(
            ch,
            '\u{3400}'..='\u{4dbf}'         // CJK Unified Ideographs Extension A
                | '\u{4e00}'..='\u{9fff}'   // CJK Unified Ideographs
                | '\u{f900}'..='\u{faff}'   // CJK Compatibility Ideographs
                | '\u{20000}'..='\u{2ee5f}' // Extensions B–F and I
                | '\u{2f800}'..='\u{2fa1f}' // CJK Compatibility Ideographs Supplement
                | '\u{30000}'..='\u{323af}' // Extensions G and H
        )
    })
}
/// Lazily compiled regex for one numbered sense: an `<li>` that starts with
/// `<i class="number">N</i>` followed by a `<p>` whose content is captured as
/// `body`. Matching is case-insensitive and `.` also matches newlines.
fn numbered_definition_re() -> &'static Regex {
    const PATTERN: &str = r#"(?is)<li[^>]*>\s*<i[^>]*class=["']number["'][^>]*>\s*\d+\s*</i>\s*<p[^>]*>(?P<body>.*?)</p>"#;
    static CELL: OnceLock<Regex> = OnceLock::new();
    CELL.get_or_init(|| Regex::new(PATTERN).unwrap())
}
/// Lazily compiled regex for a `<div>` whose `class` attribute contains
/// `info-cite`; the content up to the first closing `</div>` is captured as
/// `body`. Matching is case-insensitive and `.` also matches newlines.
fn info_cite_re() -> &'static Regex {
    const PATTERN: &str =
        r#"(?is)<div[^>]*class=["'][^"']*info-cite[^"']*["'][^>]*>(?P<body>.*?)</div>"#;
    static CELL: OnceLock<Regex> = OnceLock::new();
    CELL.get_or_init(|| Regex::new(PATTERN).unwrap())
}
/// Lazily compiled regex for a `<p ...>` element; the content up to the first
/// closing `</p>` is captured as `body`. Matching is case-insensitive and `.`
/// also matches newlines.
fn paragraph_re() -> &'static Regex {
    const PATTERN: &str = r#"(?is)<p[^>]*>(?P<body>.*?)</p>"#;
    static CELL: OnceLock<Regex> = OnceLock::new();
    CELL.get_or_init(|| Regex::new(PATTERN).unwrap())
}
/// Lazily compiled regex for a `<p>` whose `class` attribute contains `gray`;
/// the content up to the first closing `</p>` is captured as `body`. Matching
/// is case-insensitive and `.` also matches newlines.
fn related_phrase_re() -> &'static Regex {
    const PATTERN: &str =
        r#"(?is)<p[^>]*class=["'][^"']*gray[^"']*["'][^>]*>(?P<body>.*?)</p>"#;
    static CELL: OnceLock<Regex> = OnceLock::new();
    CELL.get_or_init(|| Regex::new(PATTERN).unwrap())
}
/// Lazily compiled regex for a square-bracketed tag such as `[名]`; group 1
/// captures the non-empty text between the brackets.
fn pos_re() -> &'static Regex {
    const PATTERN: &str = r"\[([^]]+)\]";
    static CELL: OnceLock<Regex> = OnceLock::new();
    CELL.get_or_init(|| Regex::new(PATTERN).unwrap())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A Chinese headword with one numbered sense, one bilingual example and one
    /// related phrase is converted into a fully populated entry.
    #[test]
    fn extracts_chinese_entry_from_mdx_html() {
        let record_html = r#"
<span class="entry_head">苹果</span>
<h5><span class="bold">[名] </span></h5>
<ol class="info-list">
<li><i class="number">1</i><p>apple (the tree and its fruit)</p>
<div class="info-cite">
<p><em>苹果</em>广泛种植于温带地区。</p>
<p><span class="italic">The apple is widely grown in temperate regions.</span></p>
</div>
</li>
</ol>
<p class="gray"><span class="bold">苹果汁</span>[名] apple cider/extract</p>
"#;

        let entry = entry_from_mdx_record("新世纪汉英大词典", "苹果", record_html).unwrap();
        assert_eq!(entry.word, "苹果");

        let first_definition = &entry.definitions[0];
        assert_eq!(first_definition.en, "apple (the tree and its fruit)");
        assert_eq!(first_definition.zh, "苹果");

        let first_example = &entry.examples[0];
        assert_eq!(
            first_example.en,
            "The apple is widely grown in temperate regions."
        );

        let first_phrase = &entry.phrases[0];
        assert_eq!(first_phrase.zh, "苹果汁");
        assert_eq!(first_phrase.en, "apple cider/extract");
    }

    /// `@@@LINK=` records are redirects and must not produce an entry.
    #[test]
    fn skips_mdx_link_records() {
        let redirected = entry_from_mdx_record("dict", "苹果", "@@@LINK=apple");
        assert!(redirected.is_none());
    }
}