1pub mod animation;
37pub mod archive;
38pub mod batch;
39pub mod browser;
40pub mod extract_images;
41pub mod figures;
42pub mod gdocs;
43pub mod html;
44pub mod latex;
45pub mod localize_images;
46pub mod markdown;
47pub mod metadata;
48pub mod postprocess;
49pub mod themed_image;
50pub mod verify;
51
52use thiserror::Error;
53
/// Version string of this crate, captured from `CARGO_PKG_VERSION` at compile time.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
56
/// Error type shared by the public functions of this crate.
///
/// The `String`-carrying variants hold a detail message that is interpolated
/// into the `Display` output; `IoError` and `RequestError` wrap their source
/// errors and can be produced with `?` thanks to the `#[from]` conversions.
#[derive(Error, Debug)]
pub enum WebCaptureError {
    /// Displayed as "Failed to fetch URL: …".
    #[error("Failed to fetch URL: {0}")]
    FetchError(String),

    /// Displayed as "Failed to parse HTML: …".
    #[error("Failed to parse HTML: {0}")]
    ParseError(String),

    /// Displayed as "Failed to convert to Markdown: …".
    #[error("Failed to convert to Markdown: {0}")]
    MarkdownError(String),

    /// Displayed as "Failed to capture screenshot: …".
    #[error("Failed to capture screenshot: {0}")]
    ScreenshotError(String),

    /// Displayed as "Browser error: …".
    #[error("Browser error: {0}")]
    BrowserError(String),

    /// Displayed as "Invalid URL: …".
    #[error("Invalid URL: {0}")]
    InvalidUrl(String),

    /// Wraps a [`std::io::Error`]; converted automatically via `From`.
    #[error("IO error: {0}")]
    IoError(#[from] std::io::Error),

    /// Wraps a `reqwest::Error`; converted automatically via `From`.
    #[error("Request error: {0}")]
    RequestError(#[from] reqwest::Error),
}
84
/// Crate-wide result alias whose error type is fixed to [`WebCaptureError`].
pub type Result<T> = std::result::Result<T, WebCaptureError>;
87
/// Crate-level entry point that forwards `url` to `html::fetch_html` and
/// returns its result unchanged.
///
/// # Errors
///
/// Propagates whatever [`WebCaptureError`] `html::fetch_html` reports.
pub async fn fetch_html(url: &str) -> Result<String> {
    html::fetch_html(url).await
}
107
/// Crate-level entry point that forwards `url` to `browser::render_html` and
/// returns its result unchanged.
///
/// # Errors
///
/// Propagates whatever [`WebCaptureError`] `browser::render_html` reports.
pub async fn render_html(url: &str) -> Result<String> {
    browser::render_html(url).await
}
127
/// Crate-level entry point that forwards `html` and the optional `base_url`
/// to `markdown::convert_html_to_markdown` and returns its result unchanged.
///
/// # Errors
///
/// Propagates whatever [`WebCaptureError`] the `markdown` module reports.
pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
    markdown::convert_html_to_markdown(html, base_url)
}
145
/// Crate-level entry point that forwards `url` to `browser::capture_screenshot`
/// and returns the resulting bytes unchanged.
///
/// # Errors
///
/// Propagates whatever [`WebCaptureError`] `browser::capture_screenshot` reports.
pub async fn capture_screenshot(url: &str) -> Result<Vec<u8>> {
    browser::capture_screenshot(url).await
}
165
/// Crate-level entry point that forwards `html` and `base_url` to
/// `html::convert_relative_urls` and returns the resulting string.
#[must_use]
pub fn convert_relative_urls(html: &str, base_url: &str) -> String {
    html::convert_relative_urls(html, base_url)
}
180
/// Crate-level entry point that forwards `html` to `html::convert_to_utf8`
/// and returns the resulting string.
#[must_use]
pub fn convert_to_utf8(html: &str) -> String {
    html::convert_to_utf8(html)
}
196
/// Switches and selectors consumed by [`convert_html_to_markdown_enhanced`].
// The four booleans are independent feature toggles, so the
// `struct_excessive_bools` lint is silenced rather than remodelled as an enum.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Clone)]
pub struct EnhancedOptions {
    /// Replace formula elements with `$…$` before conversion and collapse
    /// doubled backslashes inside `$…$` spans afterwards.
    pub extract_latex: bool,
    /// Extract article metadata, splice its header/footer blocks into the
    /// Markdown, and return it in the result.
    pub extract_metadata: bool,
    /// Run `postprocess::post_process_markdown` with default options on the output.
    pub post_process: bool,
    /// Relabel `<code>` elements tagged as MATLAB whose body looks like Coq.
    pub detect_code_language: bool,
    /// Selector used to narrow the HTML before conversion; when `body_selector`
    /// is also set it only scopes the `h1` title lookup.
    pub content_selector: Option<String>,
    /// Selector for the body fragment; when set, the converted HTML is the
    /// selected title (if any) followed by the selected body.
    pub body_selector: Option<String>,
}
214
impl Default for EnhancedOptions {
    /// Enables every enhancement toggle and leaves both selectors unset.
    fn default() -> Self {
        Self {
            extract_latex: true,
            extract_metadata: true,
            post_process: true,
            detect_code_language: true,
            content_selector: None,
            body_selector: None,
        }
    }
}
227
/// Output of [`convert_html_to_markdown_enhanced`].
#[derive(Debug, Clone)]
pub struct EnhancedMarkdownResult {
    /// The final Markdown text after all enabled passes have run.
    pub markdown: String,
    /// The extracted metadata; `None` when `EnhancedOptions::extract_metadata` was off.
    pub metadata: Option<metadata::ArticleMetadata>,
}
234
235pub fn convert_html_to_markdown_enhanced(
254 html: &str,
255 base_url: Option<&str>,
256 options: &EnhancedOptions,
257) -> Result<EnhancedMarkdownResult> {
258 let mut html_for_markdown = scope_html_with_selectors(html, options);
259
260 if options.extract_latex {
261 html_for_markdown = replace_latex_formula_elements(&html_for_markdown);
262 }
263
264 if options.detect_code_language {
265 html_for_markdown = correct_code_languages(&html_for_markdown);
266 }
267
268 let mut md = markdown::convert_html_to_markdown(&html_for_markdown, base_url)?;
270
271 let extracted_metadata = if options.extract_metadata {
273 let meta = metadata::extract_metadata(html);
274 let header_lines = metadata::format_metadata_block(&meta);
276 if !header_lines.is_empty() {
277 let header = header_lines.join("\n");
278 if let Some(pos) = md.find("\n\n") {
280 md = format!("{}\n\n{}\n{}", &md[..pos], header, &md[pos + 2..]);
281 } else {
282 md = format!("{header}\n\n{md}");
283 }
284 }
285 let footer_lines = metadata::format_footer_block(&meta);
287 if !footer_lines.is_empty() {
288 md.push_str("\n\n");
289 md.push_str(&footer_lines.join("\n"));
290 }
291 Some(meta)
292 } else {
293 None
294 };
295
296 if options.post_process {
298 md = postprocess::post_process_markdown(&md, &postprocess::PostProcessOptions::default());
299 }
300
301 if options.extract_latex {
302 md = normalize_extracted_latex_markdown(&md);
303 }
304
305 Ok(EnhancedMarkdownResult {
306 markdown: md,
307 metadata: extracted_metadata,
308 })
309}
310
311fn normalize_extracted_latex_markdown(markdown: &str) -> String {
312 let re = regex::Regex::new(r"\$([^$\n]+)\$").expect("valid regex");
313 re.replace_all(markdown, |caps: ®ex::Captures<'_>| {
314 let formula = caps.get(1).map_or("", |m| m.as_str()).replace(r"\\", r"\");
315 format!("${formula}$")
316 })
317 .into_owned()
318}
319
320fn scope_html_with_selectors(html: &str, options: &EnhancedOptions) -> String {
321 if let Some(body_selector) = options.body_selector.as_deref() {
322 let body_html = markdown::select_html(html, body_selector);
323 let title_selector = options
324 .content_selector
325 .as_deref()
326 .map_or_else(|| "h1".to_string(), |selector| format!("{selector} h1, h1"));
327 let title_html = markdown::select_html(html, &title_selector);
328 return match (title_html, body_html) {
329 (Some(title), Some(body)) => format!("{title}\n{body}"),
330 (None, Some(body)) => body,
331 _ => html.to_string(),
332 };
333 }
334
335 options
336 .content_selector
337 .as_deref()
338 .and_then(|selector| markdown::select_html(html, selector))
339 .unwrap_or_else(|| html.to_string())
340}
341
342fn replace_latex_formula_elements(html: &str) -> String {
343 let mut result = html.to_string();
344
345 let img_formula_re = regex::Regex::new(r"(?is)<img\b[^>]*>").expect("valid regex");
346 result = img_formula_re
347 .replace_all(&result, |caps: ®ex::Captures<'_>| {
348 let tag = caps.get(0).map_or("", |m| m.as_str());
349 if is_formula_img_tag(tag) {
350 extract_attr(tag, "source")
351 .or_else(|| extract_attr(tag, "alt"))
352 .map_or_else(
353 || tag.to_string(),
354 |latex| format!("${}$", normalize_latex_for_html(&latex)),
355 )
356 } else {
357 tag.to_string()
358 }
359 })
360 .into_owned();
361
362 let math_attr_re = regex::Regex::new(
363 r"(?is)<(?P<tag>mjx-container|span|div)\b(?P<attrs>[^>]*)>.*?</(?P<tag_close>mjx-container|span|div)>",
364 )
365 .expect("valid regex");
366 math_attr_re
367 .replace_all(&result, |caps: ®ex::Captures<'_>| {
368 let full = caps.get(0).map_or("", |m| m.as_str());
369 let attrs = caps.name("attrs").map_or("", |m| m.as_str());
370 let tag = caps
371 .name("tag")
372 .map_or("", |m| m.as_str())
373 .to_ascii_lowercase();
374 let tag_close = caps
375 .name("tag_close")
376 .map_or("", |m| m.as_str())
377 .to_ascii_lowercase();
378
379 if tag != tag_close || !is_math_attrs(&tag, attrs) {
380 return full.to_string();
381 }
382
383 extract_attr(attrs, "data-tex")
384 .or_else(|| extract_attr(attrs, "data-latex"))
385 .or_else(|| extract_annotation_tex(full))
386 .map_or_else(
387 || full.to_string(),
388 |latex| format!("${}$", normalize_latex_for_html(&latex)),
389 )
390 })
391 .into_owned()
392}
393
394fn correct_code_languages(html: &str) -> String {
395 let code_re = regex::Regex::new(r"(?is)<code\b(?P<attrs>[^>]*)>(?P<body>.*?)</code>")
396 .expect("valid regex");
397
398 code_re
399 .replace_all(html, |caps: ®ex::Captures<'_>| {
400 let full = caps.get(0).map_or("", |m| m.as_str());
401 let attrs = caps.name("attrs").map_or("", |m| m.as_str());
402 let body = caps.name("body").map_or("", |m| m.as_str());
403
404 if !has_matlab_language(attrs) || !looks_like_coq(body) {
405 return full.to_string();
406 }
407
408 let updated_attrs = attrs
409 .replace("language-matlab", "language-coq")
410 .replace(r#"class="matlab""#, r#"class="coq""#)
411 .replace("class='matlab'", "class='coq'");
412
413 format!("<code{updated_attrs}>{body}</code>")
414 })
415 .into_owned()
416}
417
418fn is_formula_img_tag(tag: &str) -> bool {
419 extract_attr(tag, "source").is_some()
420 || extract_attr(tag, "class").is_some_and(|classes| classes.contains("formula"))
421}
422
423fn is_math_attrs(tag: &str, attrs: &str) -> bool {
424 tag == "mjx-container"
425 || extract_attr(attrs, "class").is_some_and(|classes| {
426 classes.contains("katex") || classes.contains("math") || classes.contains("MathJax")
427 })
428}
429
430fn has_matlab_language(attrs: &str) -> bool {
431 extract_attr(attrs, "class").is_some_and(|classes| {
432 classes
433 .split_whitespace()
434 .any(|class| class == "language-matlab" || class == "matlab")
435 })
436}
437
438fn looks_like_coq(text: &str) -> bool {
439 let decoded = crate::html::decode_html_entities(text);
440 [
441 "Require Import",
442 "Definition",
443 "Fixpoint",
444 "Lemma",
445 "Theorem",
446 "Proof",
447 "Qed",
448 "Notation",
449 "Inductive",
450 ]
451 .iter()
452 .any(|needle| decoded.contains(needle))
453}
454
/// Prepares a LaTeX snippet for embedding in HTML text.
///
/// Surrounding whitespace is trimmed and every backslash is written as the
/// numeric character reference `&#92;`. Presumably this keeps the backslash
/// literal through the later HTML-to-Markdown conversion — confirm against
/// `markdown::convert_html_to_markdown`.
fn normalize_latex_for_html(latex: &str) -> String {
    latex.trim().replace('\\', "&#92;")
}
458
459fn extract_annotation_tex(html: &str) -> Option<String> {
460 let re = regex::Regex::new(
461 r#"(?is)<annotation\b[^>]*encoding\s*=\s*["']application/x-tex["'][^>]*>(.*?)</annotation>"#,
462 )
463 .ok()?;
464
465 re.captures(html).and_then(|caps| {
466 let text = caps.get(1)?.as_str().trim();
467 (!text.is_empty()).then(|| crate::html::decode_html_entities(text))
468 })
469}
470
471fn extract_attr(tag: &str, attr: &str) -> Option<String> {
472 let re = regex::Regex::new(&format!(
473 r#"(?is)\b{}\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))"#,
474 regex::escape(attr)
475 ))
476 .ok()?;
477
478 re.captures(tag).and_then(|caps| {
479 let value = caps
480 .get(1)
481 .or_else(|| caps.get(2))
482 .or_else(|| caps.get(3))?
483 .as_str()
484 .trim();
485 (!value.is_empty()).then(|| crate::html::decode_html_entities(value))
486 })
487}
488
489pub use browser::BrowserEngine;