Skip to main content

dictx_parser/
mdx.rs

1use crate::html::plain_text_from_html;
2use crate::traits::{DictParser, ValidationReport};
3use dictx_core::{clean_text, Definition, DictEntry, DictSource, Example, Phrase, Result};
4use regex::Regex;
5use serde_json::json;
6use std::path::Path;
7use std::sync::OnceLock;
8
/// Parser for MDict `.mdx` dictionary files.
///
/// The type carries no state; all behaviour lives in its [`DictParser`]
/// implementation. The standard derives make it printable in diagnostics and
/// trivially copyable / default-constructible.
#[derive(Debug, Clone, Copy, Default)]
pub struct MdxParser;
10
11impl DictParser for MdxParser {
12    fn name(&self) -> &'static str {
13        "MDict MDX dictionary"
14    }
15
16    fn format_id(&self) -> &'static str {
17        "mdx"
18    }
19
20    fn validate(&self, path: &Path) -> Result<ValidationReport> {
21        let bytes = std::fs::read(path)?;
22        let dict = parse_mdx_bytes(&bytes)?;
23        Ok(ValidationReport::ok(
24            self.format_id(),
25            Some(dict.keys().count()),
26        ))
27    }
28
29    fn parse(&self, path: &Path) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
30        let bytes = std::fs::read(path)?;
31        let dict = parse_mdx_bytes(&bytes)?;
32        let filename = path
33            .file_stem()
34            .and_then(|name| name.to_str())
35            .unwrap_or("mdx")
36            .to_string();
37        let entries = dict
38            .items()
39            .filter_map(|record| entry_from_mdx_record(&filename, record.key, &record.definition))
40            .map(Ok)
41            .collect::<Vec<_>>();
42        Ok(Box::new(entries.into_iter()))
43    }
44}
45
46fn parse_mdx_bytes(bytes: &[u8]) -> Result<mdict_parser::mdict::Mdx> {
47    std::panic::catch_unwind(|| mdict_parser::parser::parse(bytes))
48        .map_err(|_| dictx_core::DictxError::InvalidData("MDX 解析失败".to_string()))
49}
50
51fn entry_from_mdx_record(filename: &str, key: &str, html: &str) -> Option<DictEntry> {
52    let word = clean_text(key);
53    if word.is_empty() || word.eq_ignore_ascii_case("freemdict") {
54        return None;
55    }
56
57    let html = html.trim_matches('\0').trim();
58    if html.is_empty() || html.starts_with("@@@LINK=") {
59        return None;
60    }
61
62    let plain = plain_text_from_html(html);
63    if plain.is_empty() {
64        return None;
65    }
66
67    let mut entry = DictEntry::new(
68        DictSource::Mdx {
69            filename: filename.to_string(),
70        },
71        word.clone(),
72    );
73    entry.tags.push("mdx".to_string());
74
75    let pos = extract_pos(&plain);
76    let mut definitions = extract_numbered_definitions(html);
77    definitions.dedup();
78    definitions.truncate(8);
79
80    if definitions.is_empty() {
81        if contains_cjk(&word) {
82            entry
83                .definitions
84                .push(Definition::new(plain.clone(), word.clone(), pos.clone()));
85        } else {
86            entry
87                .definitions
88                .push(Definition::new("", plain.clone(), pos.clone()));
89        }
90    } else {
91        for definition in definitions {
92            if contains_cjk(&word) {
93                entry
94                    .definitions
95                    .push(Definition::new(definition, word.clone(), pos.clone()));
96            } else {
97                entry
98                    .definitions
99                    .push(Definition::new("", definition, pos.clone()));
100            }
101        }
102    }
103
104    entry.examples = extract_examples(html);
105    entry.phrases = extract_related_phrases(html);
106    entry.extra = json!({
107        "format": "mdx",
108        "source_file": filename,
109        "plain_preview": plain.chars().take(240).collect::<String>(),
110    });
111
112    if entry.definitions.is_empty() && entry.examples.is_empty() && entry.phrases.is_empty() {
113        None
114    } else {
115        Some(entry)
116    }
117}
118
119fn extract_numbered_definitions(html: &str) -> Vec<String> {
120    let mut out = Vec::new();
121    for captures in numbered_definition_re().captures_iter(html) {
122        let Some(value) = captures.name("body") else {
123            continue;
124        };
125        let text = plain_text_from_html(value.as_str());
126        if !text.is_empty() && !contains_cjk(&text) {
127            out.push(text);
128        }
129    }
130    out
131}
132
133fn extract_examples(html: &str) -> Vec<Example> {
134    let mut examples = Vec::new();
135    for captures in info_cite_re().captures_iter(html) {
136        let Some(body) = captures.name("body") else {
137            continue;
138        };
139        let ps = paragraph_texts(body.as_str());
140        if ps.len() < 2 {
141            continue;
142        }
143        let left = &ps[0];
144        let right = &ps[1];
145        let (zh, en) = if contains_cjk(left) && !contains_cjk(right) {
146            (left.clone(), right.clone())
147        } else if contains_cjk(right) && !contains_cjk(left) {
148            (right.clone(), left.clone())
149        } else {
150            continue;
151        };
152        if !en.is_empty() && !zh.is_empty() {
153            examples.push(Example { en, zh });
154        }
155        if examples.len() >= 8 {
156            break;
157        }
158    }
159    examples
160}
161
162fn extract_related_phrases(html: &str) -> Vec<Phrase> {
163    let mut phrases = Vec::new();
164    for captures in related_phrase_re().captures_iter(html) {
165        let Some(raw) = captures.name("body") else {
166            continue;
167        };
168        let text = plain_text_from_html(raw.as_str());
169        let Some((zh, en)) = split_related_phrase(&text) else {
170            continue;
171        };
172        phrases.push(Phrase { en, zh });
173        if phrases.len() >= 8 {
174            break;
175        }
176    }
177    phrases
178}
179
180fn paragraph_texts(html: &str) -> Vec<String> {
181    paragraph_re()
182        .captures_iter(html)
183        .filter_map(|captures| captures.name("body"))
184        .map(|value| plain_text_from_html(value.as_str()))
185        .filter(|value| !value.is_empty())
186        .collect()
187}
188
189fn split_related_phrase(text: &str) -> Option<(String, String)> {
190    let text = clean_text(text);
191    let (zh, rest) = text.split_once(']')?;
192    let zh = zh
193        .split('[')
194        .next()
195        .map(clean_text)
196        .filter(|value| !value.is_empty())?;
197    let en = clean_text(rest);
198    if en.is_empty() {
199        None
200    } else {
201        Some((zh, en))
202    }
203}
204
205fn extract_pos(plain: &str) -> Option<String> {
206    pos_re()
207        .captures(plain)
208        .and_then(|captures| captures.get(1))
209        .map(|value| value.as_str().trim().to_string())
210        .filter(|value| !value.is_empty())
211}
212
/// Reports whether `value` contains at least one Han (CJK) ideograph.
///
/// Recognised ranges:
/// * U+3400–U+4DBF — CJK Unified Ideographs Extension A
/// * U+4E00–U+9FFF — CJK Unified Ideographs
/// * U+F900–U+FAFF — CJK Compatibility Ideographs
/// * U+20000–U+3FFFF — the Supplementary and Tertiary Ideographic Planes,
///   which hold the remaining extensions and the compatibility supplement
///
/// Kana, Hangul and CJK punctuation are deliberately not counted, so only
/// ideographs mark a string as Chinese text.
fn contains_cjk(value: &str) -> bool {
    value.chars().any(|ch| {
        matches!(
            ch,
            '\u{3400}'..='\u{4DBF}'
                | '\u{4E00}'..='\u{9FFF}'
                | '\u{F900}'..='\u{FAFF}'
                | '\u{20000}'..='\u{3FFFF}'
        )
    })
}
218
219fn numbered_definition_re() -> &'static Regex {
220    static RE: OnceLock<Regex> = OnceLock::new();
221    RE.get_or_init(|| {
222        Regex::new(r#"(?is)<li[^>]*>\s*<i[^>]*class=["']number["'][^>]*>\s*\d+\s*</i>\s*<p[^>]*>(?P<body>.*?)</p>"#).unwrap()
223    })
224}
225
226fn info_cite_re() -> &'static Regex {
227    static RE: OnceLock<Regex> = OnceLock::new();
228    RE.get_or_init(|| {
229        Regex::new(r#"(?is)<div[^>]*class=["'][^"']*info-cite[^"']*["'][^>]*>(?P<body>.*?)</div>"#)
230            .unwrap()
231    })
232}
233
234fn paragraph_re() -> &'static Regex {
235    static RE: OnceLock<Regex> = OnceLock::new();
236    RE.get_or_init(|| Regex::new(r#"(?is)<p[^>]*>(?P<body>.*?)</p>"#).unwrap())
237}
238
239fn related_phrase_re() -> &'static Regex {
240    static RE: OnceLock<Regex> = OnceLock::new();
241    RE.get_or_init(|| {
242        Regex::new(r#"(?is)<p[^>]*class=["'][^"']*gray[^"']*["'][^>]*>(?P<body>.*?)</p>"#).unwrap()
243    })
244}
245
246fn pos_re() -> &'static Regex {
247    static RE: OnceLock<Regex> = OnceLock::new();
248    RE.get_or_init(|| Regex::new(r"\[([^]]+)\]").unwrap())
249}
250
#[cfg(test)]
mod tests {
    use super::*;

    /// A Chinese headword with one numbered definition, one bilingual
    /// example and one related phrase is converted field by field.
    #[test]
    fn extracts_chinese_entry_from_mdx_html() {
        let html = r#"
            <span class="entry_head">苹果</span>
            <h5><span class="bold">[名] </span></h5>
            <ol class="info-list">
              <li><i class="number">1</i><p>apple (the tree and its fruit)</p>
                <div class="info-cite">
                  <p><em>苹果</em>广泛种植于温带地区。</p>
                  <p><span class="italic">The apple is widely grown in temperate regions.</span></p>
                </div>
              </li>
            </ol>
            <p class="gray"><span class="bold">苹果汁</span>[名] apple cider/extract</p>
        "#;

        let parsed = entry_from_mdx_record("新世纪汉英大词典", "苹果", html).unwrap();

        assert_eq!(parsed.word, "苹果");

        let first_definition = &parsed.definitions[0];
        assert_eq!(first_definition.en, "apple (the tree and its fruit)");
        assert_eq!(first_definition.zh, "苹果");

        let first_example = &parsed.examples[0];
        assert_eq!(
            first_example.en,
            "The apple is widely grown in temperate regions."
        );

        let first_phrase = &parsed.phrases[0];
        assert_eq!(first_phrase.zh, "苹果汁");
        assert_eq!(first_phrase.en, "apple cider/extract");
    }

    /// Records that only redirect to another key produce no entry.
    #[test]
    fn skips_mdx_link_records() {
        let redirected = entry_from_mdx_record("dict", "苹果", "@@@LINK=apple");
        assert!(redirected.is_none());
    }
}