1use crate::html::plain_text_from_html;
2use crate::traits::{DictParser, ValidationReport};
3use dictx_core::{clean_text, Definition, DictEntry, DictSource, Example, Phrase, Result};
4use regex::Regex;
5use serde_json::json;
6use std::path::Path;
7use std::sync::OnceLock;
8
/// Stateless parser for MDict MDX dictionary files.
///
/// All behaviour lives in its [`DictParser`] implementation; the type itself
/// carries no data.
pub struct MdxParser;
10
11impl DictParser for MdxParser {
12 fn name(&self) -> &'static str {
13 "MDict MDX dictionary"
14 }
15
16 fn format_id(&self) -> &'static str {
17 "mdx"
18 }
19
20 fn validate(&self, path: &Path) -> Result<ValidationReport> {
21 let bytes = std::fs::read(path)?;
22 let dict = parse_mdx_bytes(&bytes)?;
23 Ok(ValidationReport::ok(
24 self.format_id(),
25 Some(dict.keys().count()),
26 ))
27 }
28
29 fn parse(&self, path: &Path) -> Result<Box<dyn Iterator<Item = Result<DictEntry>>>> {
30 let bytes = std::fs::read(path)?;
31 let dict = parse_mdx_bytes(&bytes)?;
32 let filename = path
33 .file_stem()
34 .and_then(|name| name.to_str())
35 .unwrap_or("mdx")
36 .to_string();
37 let entries = dict
38 .items()
39 .filter_map(|record| entry_from_mdx_record(&filename, record.key, &record.definition))
40 .map(Ok)
41 .collect::<Vec<_>>();
42 Ok(Box::new(entries.into_iter()))
43 }
44}
45
46fn parse_mdx_bytes(bytes: &[u8]) -> Result<mdict_parser::mdict::Mdx> {
47 std::panic::catch_unwind(|| mdict_parser::parser::parse(bytes))
48 .map_err(|_| dictx_core::DictxError::InvalidData("MDX 解析失败".to_string()))
49}
50
51fn entry_from_mdx_record(filename: &str, key: &str, html: &str) -> Option<DictEntry> {
52 let word = clean_text(key);
53 if word.is_empty() || word.eq_ignore_ascii_case("freemdict") {
54 return None;
55 }
56
57 let html = html.trim_matches('\0').trim();
58 if html.is_empty() || html.starts_with("@@@LINK=") {
59 return None;
60 }
61
62 let plain = plain_text_from_html(html);
63 if plain.is_empty() {
64 return None;
65 }
66
67 let mut entry = DictEntry::new(
68 DictSource::Mdx {
69 filename: filename.to_string(),
70 },
71 word.clone(),
72 );
73 entry.tags.push("mdx".to_string());
74
75 let pos = extract_pos(&plain);
76 let mut definitions = extract_numbered_definitions(html);
77 definitions.dedup();
78 definitions.truncate(8);
79
80 if definitions.is_empty() {
81 if contains_cjk(&word) {
82 entry
83 .definitions
84 .push(Definition::new(plain.clone(), word.clone(), pos.clone()));
85 } else {
86 entry
87 .definitions
88 .push(Definition::new("", plain.clone(), pos.clone()));
89 }
90 } else {
91 for definition in definitions {
92 if contains_cjk(&word) {
93 entry
94 .definitions
95 .push(Definition::new(definition, word.clone(), pos.clone()));
96 } else {
97 entry
98 .definitions
99 .push(Definition::new("", definition, pos.clone()));
100 }
101 }
102 }
103
104 entry.examples = extract_examples(html);
105 entry.phrases = extract_related_phrases(html);
106 entry.extra = json!({
107 "format": "mdx",
108 "source_file": filename,
109 "plain_preview": plain.chars().take(240).collect::<String>(),
110 });
111
112 if entry.definitions.is_empty() && entry.examples.is_empty() && entry.phrases.is_empty() {
113 None
114 } else {
115 Some(entry)
116 }
117}
118
119fn extract_numbered_definitions(html: &str) -> Vec<String> {
120 let mut out = Vec::new();
121 for captures in numbered_definition_re().captures_iter(html) {
122 let Some(value) = captures.name("body") else {
123 continue;
124 };
125 let text = plain_text_from_html(value.as_str());
126 if !text.is_empty() && !contains_cjk(&text) {
127 out.push(text);
128 }
129 }
130 out
131}
132
133fn extract_examples(html: &str) -> Vec<Example> {
134 let mut examples = Vec::new();
135 for captures in info_cite_re().captures_iter(html) {
136 let Some(body) = captures.name("body") else {
137 continue;
138 };
139 let ps = paragraph_texts(body.as_str());
140 if ps.len() < 2 {
141 continue;
142 }
143 let left = &ps[0];
144 let right = &ps[1];
145 let (zh, en) = if contains_cjk(left) && !contains_cjk(right) {
146 (left.clone(), right.clone())
147 } else if contains_cjk(right) && !contains_cjk(left) {
148 (right.clone(), left.clone())
149 } else {
150 continue;
151 };
152 if !en.is_empty() && !zh.is_empty() {
153 examples.push(Example { en, zh });
154 }
155 if examples.len() >= 8 {
156 break;
157 }
158 }
159 examples
160}
161
162fn extract_related_phrases(html: &str) -> Vec<Phrase> {
163 let mut phrases = Vec::new();
164 for captures in related_phrase_re().captures_iter(html) {
165 let Some(raw) = captures.name("body") else {
166 continue;
167 };
168 let text = plain_text_from_html(raw.as_str());
169 let Some((zh, en)) = split_related_phrase(&text) else {
170 continue;
171 };
172 phrases.push(Phrase { en, zh });
173 if phrases.len() >= 8 {
174 break;
175 }
176 }
177 phrases
178}
179
180fn paragraph_texts(html: &str) -> Vec<String> {
181 paragraph_re()
182 .captures_iter(html)
183 .filter_map(|captures| captures.name("body"))
184 .map(|value| plain_text_from_html(value.as_str()))
185 .filter(|value| !value.is_empty())
186 .collect()
187}
188
189fn split_related_phrase(text: &str) -> Option<(String, String)> {
190 let text = clean_text(text);
191 let (zh, rest) = text.split_once(']')?;
192 let zh = zh
193 .split('[')
194 .next()
195 .map(clean_text)
196 .filter(|value| !value.is_empty())?;
197 let en = clean_text(rest);
198 if en.is_empty() {
199 None
200 } else {
201 Some((zh, en))
202 }
203}
204
205fn extract_pos(plain: &str) -> Option<String> {
206 pos_re()
207 .captures(plain)
208 .and_then(|captures| captures.get(1))
209 .map(|value| value.as_str().trim().to_string())
210 .filter(|value| !value.is_empty())
211}
212
/// Reports whether `value` contains at least one Han (CJK) ideograph.
///
/// Besides the basic CJK Unified Ideographs block this also recognises
/// Extension A, the compatibility ideographs and the supplementary-plane
/// extensions, so rare characters are not mistaken for non-Chinese text.
/// Kana, Hangul and CJK punctuation are intentionally not counted.
fn contains_cjk(value: &str) -> bool {
    value.chars().any(|ch| {
        matches!(
            ch,
            '\u{3400}'..='\u{4DBF}'       // CJK Unified Ideographs Extension A
                | '\u{4E00}'..='\u{9FFF}' // CJK Unified Ideographs
                | '\u{F900}'..='\u{FAFF}' // CJK Compatibility Ideographs
                | '\u{20000}'..='\u{2FA1F}' // Extensions B-F, I and Compatibility Supplement
                | '\u{30000}'..='\u{323AF}' // Extensions G-H
        )
    })
}
218
219fn numbered_definition_re() -> &'static Regex {
220 static RE: OnceLock<Regex> = OnceLock::new();
221 RE.get_or_init(|| {
222 Regex::new(r#"(?is)<li[^>]*>\s*<i[^>]*class=["']number["'][^>]*>\s*\d+\s*</i>\s*<p[^>]*>(?P<body>.*?)</p>"#).unwrap()
223 })
224}
225
226fn info_cite_re() -> &'static Regex {
227 static RE: OnceLock<Regex> = OnceLock::new();
228 RE.get_or_init(|| {
229 Regex::new(r#"(?is)<div[^>]*class=["'][^"']*info-cite[^"']*["'][^>]*>(?P<body>.*?)</div>"#)
230 .unwrap()
231 })
232}
233
234fn paragraph_re() -> &'static Regex {
235 static RE: OnceLock<Regex> = OnceLock::new();
236 RE.get_or_init(|| Regex::new(r#"(?is)<p[^>]*>(?P<body>.*?)</p>"#).unwrap())
237}
238
239fn related_phrase_re() -> &'static Regex {
240 static RE: OnceLock<Regex> = OnceLock::new();
241 RE.get_or_init(|| {
242 Regex::new(r#"(?is)<p[^>]*class=["'][^"']*gray[^"']*["'][^>]*>(?P<body>.*?)</p>"#).unwrap()
243 })
244}
245
246fn pos_re() -> &'static Regex {
247 static RE: OnceLock<Regex> = OnceLock::new();
248 RE.get_or_init(|| Regex::new(r"\[([^]]+)\]").unwrap())
249}
250
#[cfg(test)]
mod tests {
    use super::*;

    /// Sample record body with one numbered definition, one bilingual
    /// example block and one related phrase.
    const APPLE_HTML: &str = r#"
        <span class="entry_head">苹果</span>
        <h5><span class="bold">[名] </span></h5>
        <ol class="info-list">
            <li><i class="number">1</i><p>apple (the tree and its fruit)</p>
                <div class="info-cite">
                    <p><em>苹果</em>广泛种植于温带地区。</p>
                    <p><span class="italic">The apple is widely grown in temperate regions.</span></p>
                </div>
            </li>
        </ol>
        <p class="gray"><span class="bold">苹果汁</span>[名] apple cider/extract</p>
        "#;

    /// A Chinese headword yields the numbered definition, the example pair
    /// and the related phrase from the record body.
    #[test]
    fn extracts_chinese_entry_from_mdx_html() {
        let entry = entry_from_mdx_record("新世纪汉英大词典", "苹果", APPLE_HTML).unwrap();

        assert_eq!(entry.word, "苹果");

        let definition = &entry.definitions[0];
        assert_eq!(definition.en, "apple (the tree and its fruit)");
        assert_eq!(definition.zh, "苹果");

        let example = &entry.examples[0];
        assert_eq!(
            example.en,
            "The apple is widely grown in temperate regions."
        );

        let phrase = &entry.phrases[0];
        assert_eq!(phrase.zh, "苹果汁");
        assert_eq!(phrase.en, "apple cider/extract");
    }

    /// Redirect records (`@@@LINK=`) never produce an entry.
    #[test]
    fn skips_mdx_link_records() {
        let converted = entry_from_mdx_record("dict", "苹果", "@@@LINK=apple");
        assert!(converted.is_none());
    }
}