1pub mod animation;
37pub mod batch;
38pub mod browser;
39pub mod extract_images;
40pub mod figures;
41pub mod gdocs;
42pub mod html;
43pub mod latex;
44pub mod localize_images;
45pub mod markdown;
46pub mod metadata;
47pub mod postprocess;
48pub mod themed_image;
49pub mod verify;
50
51use thiserror::Error;
52
53pub const VERSION: &str = env!("CARGO_PKG_VERSION");
55
56#[derive(Error, Debug)]
58pub enum WebCaptureError {
59 #[error("Failed to fetch URL: {0}")]
60 FetchError(String),
61
62 #[error("Failed to parse HTML: {0}")]
63 ParseError(String),
64
65 #[error("Failed to convert to Markdown: {0}")]
66 MarkdownError(String),
67
68 #[error("Failed to capture screenshot: {0}")]
69 ScreenshotError(String),
70
71 #[error("Browser error: {0}")]
72 BrowserError(String),
73
74 #[error("Invalid URL: {0}")]
75 InvalidUrl(String),
76
77 #[error("IO error: {0}")]
78 IoError(#[from] std::io::Error),
79
80 #[error("Request error: {0}")]
81 RequestError(#[from] reqwest::Error),
82}
83
84pub type Result<T> = std::result::Result<T, WebCaptureError>;
86
87pub async fn fetch_html(url: &str) -> Result<String> {
104 html::fetch_html(url).await
105}
106
107pub async fn render_html(url: &str) -> Result<String> {
124 browser::render_html(url).await
125}
126
127pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
142 markdown::convert_html_to_markdown(html, base_url)
143}
144
145pub async fn capture_screenshot(url: &str) -> Result<Vec<u8>> {
162 browser::capture_screenshot(url).await
163}
164
165#[must_use]
176pub fn convert_relative_urls(html: &str, base_url: &str) -> String {
177 html::convert_relative_urls(html, base_url)
178}
179
180#[must_use]
192pub fn convert_to_utf8(html: &str) -> String {
193 html::convert_to_utf8(html)
194}
195
/// Switches controlling [`convert_html_to_markdown_enhanced`].
// The bools are independent feature toggles, not an encoded state machine,
// so the clippy lint is silenced deliberately.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Clone)]
pub struct EnhancedOptions {
    /// Replace recognised formula elements with inline `$…$` LaTeX before
    /// conversion and normalise the resulting spans afterwards.
    pub extract_latex: bool,
    /// Extract article metadata and splice its header/footer blocks into the
    /// Markdown output.
    pub extract_metadata: bool,
    /// Run the Markdown post-processor with its default options.
    pub post_process: bool,
    /// Correct mislabelled `<code>` language classes before conversion.
    pub detect_code_language: bool,
    /// Selector used to narrow the HTML to the main content; when
    /// `body_selector` is also set it only scopes the title lookup.
    pub content_selector: Option<String>,
    /// Selector for the article body; when set, the output is the first
    /// matching title followed by this body.
    pub body_selector: Option<String>,
}
213
214impl Default for EnhancedOptions {
215 fn default() -> Self {
216 Self {
217 extract_latex: true,
218 extract_metadata: true,
219 post_process: true,
220 detect_code_language: true,
221 content_selector: None,
222 body_selector: None,
223 }
224 }
225}
226
227#[derive(Debug, Clone)]
229pub struct EnhancedMarkdownResult {
230 pub markdown: String,
231 pub metadata: Option<metadata::ArticleMetadata>,
232}
233
234pub fn convert_html_to_markdown_enhanced(
253 html: &str,
254 base_url: Option<&str>,
255 options: &EnhancedOptions,
256) -> Result<EnhancedMarkdownResult> {
257 let mut html_for_markdown = scope_html_with_selectors(html, options);
258
259 if options.extract_latex {
260 html_for_markdown = replace_latex_formula_elements(&html_for_markdown);
261 }
262
263 if options.detect_code_language {
264 html_for_markdown = correct_code_languages(&html_for_markdown);
265 }
266
267 let mut md = markdown::convert_html_to_markdown(&html_for_markdown, base_url)?;
269
270 let extracted_metadata = if options.extract_metadata {
272 let meta = metadata::extract_metadata(html);
273 let header_lines = metadata::format_metadata_block(&meta);
275 if !header_lines.is_empty() {
276 let header = header_lines.join("\n");
277 if let Some(pos) = md.find("\n\n") {
279 md = format!("{}\n\n{}\n{}", &md[..pos], header, &md[pos + 2..]);
280 } else {
281 md = format!("{header}\n\n{md}");
282 }
283 }
284 let footer_lines = metadata::format_footer_block(&meta);
286 if !footer_lines.is_empty() {
287 md.push_str("\n\n");
288 md.push_str(&footer_lines.join("\n"));
289 }
290 Some(meta)
291 } else {
292 None
293 };
294
295 if options.post_process {
297 md = postprocess::post_process_markdown(&md, &postprocess::PostProcessOptions::default());
298 }
299
300 if options.extract_latex {
301 md = normalize_extracted_latex_markdown(&md);
302 }
303
304 Ok(EnhancedMarkdownResult {
305 markdown: md,
306 metadata: extracted_metadata,
307 })
308}
309
310fn normalize_extracted_latex_markdown(markdown: &str) -> String {
311 let re = regex::Regex::new(r"\$([^$\n]+)\$").expect("valid regex");
312 re.replace_all(markdown, |caps: ®ex::Captures<'_>| {
313 let formula = caps.get(1).map_or("", |m| m.as_str()).replace(r"\\", r"\");
314 format!("${formula}$")
315 })
316 .into_owned()
317}
318
319fn scope_html_with_selectors(html: &str, options: &EnhancedOptions) -> String {
320 if let Some(body_selector) = options.body_selector.as_deref() {
321 let body_html = markdown::select_html(html, body_selector);
322 let title_selector = options
323 .content_selector
324 .as_deref()
325 .map_or_else(|| "h1".to_string(), |selector| format!("{selector} h1, h1"));
326 let title_html = markdown::select_html(html, &title_selector);
327 return match (title_html, body_html) {
328 (Some(title), Some(body)) => format!("{title}\n{body}"),
329 (None, Some(body)) => body,
330 _ => html.to_string(),
331 };
332 }
333
334 options
335 .content_selector
336 .as_deref()
337 .and_then(|selector| markdown::select_html(html, selector))
338 .unwrap_or_else(|| html.to_string())
339}
340
341fn replace_latex_formula_elements(html: &str) -> String {
342 let mut result = html.to_string();
343
344 let img_formula_re = regex::Regex::new(r"(?is)<img\b[^>]*>").expect("valid regex");
345 result = img_formula_re
346 .replace_all(&result, |caps: ®ex::Captures<'_>| {
347 let tag = caps.get(0).map_or("", |m| m.as_str());
348 if is_formula_img_tag(tag) {
349 extract_attr(tag, "source")
350 .or_else(|| extract_attr(tag, "alt"))
351 .map_or_else(
352 || tag.to_string(),
353 |latex| format!("${}$", normalize_latex_for_html(&latex)),
354 )
355 } else {
356 tag.to_string()
357 }
358 })
359 .into_owned();
360
361 let math_attr_re = regex::Regex::new(
362 r"(?is)<(?P<tag>mjx-container|span|div)\b(?P<attrs>[^>]*)>.*?</(?P<tag_close>mjx-container|span|div)>",
363 )
364 .expect("valid regex");
365 math_attr_re
366 .replace_all(&result, |caps: ®ex::Captures<'_>| {
367 let full = caps.get(0).map_or("", |m| m.as_str());
368 let attrs = caps.name("attrs").map_or("", |m| m.as_str());
369 let tag = caps
370 .name("tag")
371 .map_or("", |m| m.as_str())
372 .to_ascii_lowercase();
373 let tag_close = caps
374 .name("tag_close")
375 .map_or("", |m| m.as_str())
376 .to_ascii_lowercase();
377
378 if tag != tag_close || !is_math_attrs(&tag, attrs) {
379 return full.to_string();
380 }
381
382 extract_attr(attrs, "data-tex")
383 .or_else(|| extract_attr(attrs, "data-latex"))
384 .or_else(|| extract_annotation_tex(full))
385 .map_or_else(
386 || full.to_string(),
387 |latex| format!("${}$", normalize_latex_for_html(&latex)),
388 )
389 })
390 .into_owned()
391}
392
393fn correct_code_languages(html: &str) -> String {
394 let code_re = regex::Regex::new(r"(?is)<code\b(?P<attrs>[^>]*)>(?P<body>.*?)</code>")
395 .expect("valid regex");
396
397 code_re
398 .replace_all(html, |caps: ®ex::Captures<'_>| {
399 let full = caps.get(0).map_or("", |m| m.as_str());
400 let attrs = caps.name("attrs").map_or("", |m| m.as_str());
401 let body = caps.name("body").map_or("", |m| m.as_str());
402
403 if !has_matlab_language(attrs) || !looks_like_coq(body) {
404 return full.to_string();
405 }
406
407 let updated_attrs = attrs
408 .replace("language-matlab", "language-coq")
409 .replace(r#"class="matlab""#, r#"class="coq""#)
410 .replace("class='matlab'", "class='coq'");
411
412 format!("<code{updated_attrs}>{body}</code>")
413 })
414 .into_owned()
415}
416
417fn is_formula_img_tag(tag: &str) -> bool {
418 extract_attr(tag, "source").is_some()
419 || extract_attr(tag, "class").is_some_and(|classes| classes.contains("formula"))
420}
421
422fn is_math_attrs(tag: &str, attrs: &str) -> bool {
423 tag == "mjx-container"
424 || extract_attr(attrs, "class").is_some_and(|classes| {
425 classes.contains("katex") || classes.contains("math") || classes.contains("MathJax")
426 })
427}
428
429fn has_matlab_language(attrs: &str) -> bool {
430 extract_attr(attrs, "class").is_some_and(|classes| {
431 classes
432 .split_whitespace()
433 .any(|class| class == "language-matlab" || class == "matlab")
434 })
435}
436
437fn looks_like_coq(text: &str) -> bool {
438 let decoded = crate::html::decode_html_entities(text);
439 [
440 "Require Import",
441 "Definition",
442 "Fixpoint",
443 "Lemma",
444 "Theorem",
445 "Proof",
446 "Qed",
447 "Notation",
448 "Inductive",
449 ]
450 .iter()
451 .any(|needle| decoded.contains(needle))
452}
453
/// Prepares a LaTeX snippet for embedding in HTML text.
///
/// Surrounding whitespace is trimmed and every backslash is written as the
/// numeric character reference `&#92;`, so the raw backslash never appears
/// in the HTML handed to the Markdown converter.
fn normalize_latex_for_html(latex: &str) -> String {
    // NOTE(review): the replacement literal was reconstructed from a garbled
    // source (it had collapsed to a bare backslash); confirm `&#92;` against
    // the upstream file.
    latex.trim().replace('\\', "&#92;")
}
457
458fn extract_annotation_tex(html: &str) -> Option<String> {
459 let re = regex::Regex::new(
460 r#"(?is)<annotation\b[^>]*encoding\s*=\s*["']application/x-tex["'][^>]*>(.*?)</annotation>"#,
461 )
462 .ok()?;
463
464 re.captures(html).and_then(|caps| {
465 let text = caps.get(1)?.as_str().trim();
466 (!text.is_empty()).then(|| crate::html::decode_html_entities(text))
467 })
468}
469
470fn extract_attr(tag: &str, attr: &str) -> Option<String> {
471 let re = regex::Regex::new(&format!(
472 r#"(?is)\b{}\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))"#,
473 regex::escape(attr)
474 ))
475 .ok()?;
476
477 re.captures(tag).and_then(|caps| {
478 let value = caps
479 .get(1)
480 .or_else(|| caps.get(2))
481 .or_else(|| caps.get(3))?
482 .as_str()
483 .trim();
484 (!value.is_empty()).then(|| crate::html::decode_html_entities(value))
485 })
486}
487
488pub use browser::BrowserEngine;