1pub mod animation;
37pub mod archive;
38pub mod batch;
39pub mod browser;
40pub mod extract_images;
41pub mod figures;
42pub mod gdocs;
43pub mod html;
44pub mod latex;
45pub mod localize_images;
46pub mod markdown;
47pub mod metadata;
48pub mod postprocess;
49pub mod search;
50pub mod themed_image;
51pub mod verify;
52
53use thiserror::Error;
54
/// Version string of this crate, taken from `CARGO_PKG_VERSION` at compile time.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
57
/// Error type used by the fallible functions of this crate (see [`Result`]).
///
/// The `Display` text of every variant is defined by its `#[error(...)]`
/// attribute; `{0}` is replaced with the variant's payload.
#[derive(Error, Debug)]
pub enum WebCaptureError {
    /// Fetching a URL failed; the payload is the detail appended to the message.
    #[error("Failed to fetch URL: {0}")]
    FetchError(String),

    /// Parsing HTML failed; the payload is the detail appended to the message.
    #[error("Failed to parse HTML: {0}")]
    ParseError(String),

    /// Converting to Markdown failed; the payload is the detail appended to the message.
    #[error("Failed to convert to Markdown: {0}")]
    MarkdownError(String),

    /// Capturing a screenshot failed; the payload is the detail appended to the message.
    #[error("Failed to capture screenshot: {0}")]
    ScreenshotError(String),

    /// A browser-related failure; the payload is the detail appended to the message.
    #[error("Browser error: {0}")]
    BrowserError(String),

    /// A URL was rejected as invalid; the payload is the detail appended to the message.
    #[error("Invalid URL: {0}")]
    InvalidUrl(String),

    /// Wraps a [`std::io::Error`]. `#[from]` generates the `From` impl, so `?`
    /// converts I/O errors into this variant automatically.
    #[error("IO error: {0}")]
    IoError(#[from] std::io::Error),

    /// Wraps a `reqwest::Error`. `#[from]` generates the `From` impl, so `?`
    /// converts request errors into this variant automatically.
    #[error("Request error: {0}")]
    RequestError(#[from] reqwest::Error),
}
85
/// Shorthand for [`std::result::Result`] with [`WebCaptureError`] as the error type.
pub type Result<T> = std::result::Result<T, WebCaptureError>;
88
/// Crate-root entry point that forwards `url` to `html::fetch_html` and awaits it.
///
/// # Errors
///
/// Returns whatever [`WebCaptureError`] `html::fetch_html` produces.
pub async fn fetch_html(url: &str) -> Result<String> {
    html::fetch_html(url).await
}
108
/// Crate-root entry point that forwards `url` to `browser::render_html` and awaits it.
///
/// NOTE(review): presumably returns the page HTML after rendering in a browser —
/// confirm against the `browser` module.
///
/// # Errors
///
/// Returns whatever [`WebCaptureError`] `browser::render_html` produces.
pub async fn render_html(url: &str) -> Result<String> {
    browser::render_html(url).await
}
128
/// Crate-root entry point that forwards to `markdown::convert_html_to_markdown`.
///
/// `html` and `base_url` are passed through unchanged.
/// NOTE(review): `base_url` is presumably used to resolve relative links —
/// confirm against the `markdown` module.
///
/// # Errors
///
/// Returns whatever [`WebCaptureError`] `markdown::convert_html_to_markdown` produces.
pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
    markdown::convert_html_to_markdown(html, base_url)
}
146
/// Crate-root entry point that forwards `url` to `browser::capture_screenshot`
/// and awaits it, returning the bytes it yields.
///
/// NOTE(review): the image format of the returned bytes is decided by the
/// `browser` module — confirm there.
///
/// # Errors
///
/// Returns whatever [`WebCaptureError`] `browser::capture_screenshot` produces.
pub async fn capture_screenshot(url: &str) -> Result<Vec<u8>> {
    browser::capture_screenshot(url).await
}
166
/// Crate-root entry point that forwards `html` and `base_url` to
/// `html::convert_relative_urls` and returns its result.
///
/// NOTE(review): presumably rewrites relative URLs in `html` against `base_url`;
/// the exact rules live in the `html` module — confirm there.
#[must_use]
pub fn convert_relative_urls(html: &str, base_url: &str) -> String {
    html::convert_relative_urls(html, base_url)
}
181
/// Crate-root entry point that forwards `html` to `html::convert_to_utf8` and
/// returns its result.
///
/// NOTE(review): the input is already a `&str` (valid UTF-8), so the helper
/// presumably adjusts charset declarations rather than bytes — confirm in the
/// `html` module.
#[must_use]
pub fn convert_to_utf8(html: &str) -> String {
    html::convert_to_utf8(html)
}
197
/// Switches and selectors that control [`convert_html_to_markdown_enhanced`].
// Each flag toggles an independent stage of the conversion pipeline, so plain
// bools are kept instead of folding them into an enum or bitflags.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Clone)]
pub struct EnhancedOptions {
    /// When set, formula elements are replaced with `$…$` text before the
    /// Markdown conversion, and doubled backslashes inside `$…$` spans are
    /// collapsed afterwards.
    pub extract_latex: bool,
    /// When set, metadata is extracted from the full input HTML, its header
    /// and footer blocks are inserted into the Markdown, and the metadata is
    /// returned in [`EnhancedMarkdownResult::metadata`].
    pub extract_metadata: bool,
    /// When set, the Markdown is run through
    /// `postprocess::post_process_markdown` with default options.
    pub post_process: bool,
    /// When set, `<code>` elements tagged as MATLAB whose body contains Coq
    /// keywords are relabelled as Coq before conversion.
    pub detect_code_language: bool,
    /// Selector used to narrow the HTML before conversion. When
    /// `body_selector` is also set, this selector is only used to locate the
    /// title (`"{content_selector} h1, h1"`).
    pub content_selector: Option<String>,
    /// Selector for the body fragment. When set and the selection yields
    /// something, the converted HTML is the selected title (if any) followed
    /// by the selected body; otherwise the whole input is used.
    pub body_selector: Option<String>,
}
215
impl Default for EnhancedOptions {
    /// Returns options with every processing flag enabled and no selectors set.
    ///
    /// `Default` is implemented by hand because a derived impl would set the
    /// boolean flags to `false`.
    fn default() -> Self {
        Self {
            extract_latex: true,
            extract_metadata: true,
            post_process: true,
            detect_code_language: true,
            content_selector: None,
            body_selector: None,
        }
    }
}
228
/// Output of [`convert_html_to_markdown_enhanced`].
#[derive(Debug, Clone)]
pub struct EnhancedMarkdownResult {
    /// The final Markdown text after all enabled stages have run.
    pub markdown: String,
    /// The extracted article metadata; `None` when
    /// [`EnhancedOptions::extract_metadata`] was disabled.
    pub metadata: Option<metadata::ArticleMetadata>,
}
235
236pub fn convert_html_to_markdown_enhanced(
255 html: &str,
256 base_url: Option<&str>,
257 options: &EnhancedOptions,
258) -> Result<EnhancedMarkdownResult> {
259 let mut html_for_markdown = scope_html_with_selectors(html, options);
260
261 if options.extract_latex {
262 html_for_markdown = replace_latex_formula_elements(&html_for_markdown);
263 }
264
265 if options.detect_code_language {
266 html_for_markdown = correct_code_languages(&html_for_markdown);
267 }
268
269 let mut md = markdown::convert_html_to_markdown(&html_for_markdown, base_url)?;
271
272 let extracted_metadata = if options.extract_metadata {
274 let meta = metadata::extract_metadata(html);
275 let header_lines = metadata::format_metadata_block(&meta);
277 if !header_lines.is_empty() {
278 let header = header_lines.join("\n");
279 if let Some(pos) = md.find("\n\n") {
281 md = format!("{}\n\n{}\n{}", &md[..pos], header, &md[pos + 2..]);
282 } else {
283 md = format!("{header}\n\n{md}");
284 }
285 }
286 let footer_lines = metadata::format_footer_block(&meta);
288 if !footer_lines.is_empty() {
289 md.push_str("\n\n");
290 md.push_str(&footer_lines.join("\n"));
291 }
292 Some(meta)
293 } else {
294 None
295 };
296
297 if options.post_process {
299 md = postprocess::post_process_markdown(&md, &postprocess::PostProcessOptions::default());
300 }
301
302 if options.extract_latex {
303 md = normalize_extracted_latex_markdown(&md);
304 }
305
306 Ok(EnhancedMarkdownResult {
307 markdown: md,
308 metadata: extracted_metadata,
309 })
310}
311
312fn normalize_extracted_latex_markdown(markdown: &str) -> String {
313 let re = regex::Regex::new(r"\$([^$\n]+)\$").expect("valid regex");
314 re.replace_all(markdown, |caps: ®ex::Captures<'_>| {
315 let formula = caps.get(1).map_or("", |m| m.as_str()).replace(r"\\", r"\");
316 format!("${formula}$")
317 })
318 .into_owned()
319}
320
321fn scope_html_with_selectors(html: &str, options: &EnhancedOptions) -> String {
322 if let Some(body_selector) = options.body_selector.as_deref() {
323 let body_html = markdown::select_html(html, body_selector);
324 let title_selector = options
325 .content_selector
326 .as_deref()
327 .map_or_else(|| "h1".to_string(), |selector| format!("{selector} h1, h1"));
328 let title_html = markdown::select_html(html, &title_selector);
329 return match (title_html, body_html) {
330 (Some(title), Some(body)) => format!("{title}\n{body}"),
331 (None, Some(body)) => body,
332 _ => html.to_string(),
333 };
334 }
335
336 options
337 .content_selector
338 .as_deref()
339 .and_then(|selector| markdown::select_html(html, selector))
340 .unwrap_or_else(|| html.to_string())
341}
342
343fn replace_latex_formula_elements(html: &str) -> String {
344 let mut result = html.to_string();
345
346 let img_formula_re = regex::Regex::new(r"(?is)<img\b[^>]*>").expect("valid regex");
347 result = img_formula_re
348 .replace_all(&result, |caps: ®ex::Captures<'_>| {
349 let tag = caps.get(0).map_or("", |m| m.as_str());
350 if is_formula_img_tag(tag) {
351 extract_attr(tag, "source")
352 .or_else(|| extract_attr(tag, "alt"))
353 .map_or_else(
354 || tag.to_string(),
355 |latex| format!("${}$", normalize_latex_for_html(&latex)),
356 )
357 } else {
358 tag.to_string()
359 }
360 })
361 .into_owned();
362
363 let math_attr_re = regex::Regex::new(
364 r"(?is)<(?P<tag>mjx-container|span|div)\b(?P<attrs>[^>]*)>.*?</(?P<tag_close>mjx-container|span|div)>",
365 )
366 .expect("valid regex");
367 math_attr_re
368 .replace_all(&result, |caps: ®ex::Captures<'_>| {
369 let full = caps.get(0).map_or("", |m| m.as_str());
370 let attrs = caps.name("attrs").map_or("", |m| m.as_str());
371 let tag = caps
372 .name("tag")
373 .map_or("", |m| m.as_str())
374 .to_ascii_lowercase();
375 let tag_close = caps
376 .name("tag_close")
377 .map_or("", |m| m.as_str())
378 .to_ascii_lowercase();
379
380 if tag != tag_close || !is_math_attrs(&tag, attrs) {
381 return full.to_string();
382 }
383
384 extract_attr(attrs, "data-tex")
385 .or_else(|| extract_attr(attrs, "data-latex"))
386 .or_else(|| extract_annotation_tex(full))
387 .map_or_else(
388 || full.to_string(),
389 |latex| format!("${}$", normalize_latex_for_html(&latex)),
390 )
391 })
392 .into_owned()
393}
394
395fn correct_code_languages(html: &str) -> String {
396 let code_re = regex::Regex::new(r"(?is)<code\b(?P<attrs>[^>]*)>(?P<body>.*?)</code>")
397 .expect("valid regex");
398
399 code_re
400 .replace_all(html, |caps: ®ex::Captures<'_>| {
401 let full = caps.get(0).map_or("", |m| m.as_str());
402 let attrs = caps.name("attrs").map_or("", |m| m.as_str());
403 let body = caps.name("body").map_or("", |m| m.as_str());
404
405 if !has_matlab_language(attrs) || !looks_like_coq(body) {
406 return full.to_string();
407 }
408
409 let updated_attrs = attrs
410 .replace("language-matlab", "language-coq")
411 .replace(r#"class="matlab""#, r#"class="coq""#)
412 .replace("class='matlab'", "class='coq'");
413
414 format!("<code{updated_attrs}>{body}</code>")
415 })
416 .into_owned()
417}
418
419fn is_formula_img_tag(tag: &str) -> bool {
420 extract_attr(tag, "source").is_some()
421 || extract_attr(tag, "class").is_some_and(|classes| classes.contains("formula"))
422}
423
424fn is_math_attrs(tag: &str, attrs: &str) -> bool {
425 tag == "mjx-container"
426 || extract_attr(attrs, "class").is_some_and(|classes| {
427 classes.contains("katex") || classes.contains("math") || classes.contains("MathJax")
428 })
429}
430
431fn has_matlab_language(attrs: &str) -> bool {
432 extract_attr(attrs, "class").is_some_and(|classes| {
433 classes
434 .split_whitespace()
435 .any(|class| class == "language-matlab" || class == "matlab")
436 })
437}
438
439fn looks_like_coq(text: &str) -> bool {
440 let decoded = crate::html::decode_html_entities(text);
441 [
442 "Require Import",
443 "Definition",
444 "Fixpoint",
445 "Lemma",
446 "Theorem",
447 "Proof",
448 "Qed",
449 "Notation",
450 "Inductive",
451 ]
452 .iter()
453 .any(|needle| decoded.contains(needle))
454}
455
/// Prepares a LaTeX snippet for insertion into the intermediate HTML.
///
/// Surrounding whitespace is trimmed and every backslash is doubled, so that
/// the `\\` → `\` collapse applied to `$…$` spans by
/// `normalize_extracted_latex_markdown` after conversion restores the
/// original formula.
///
/// NOTE(review): the replacement literal in the previous revision was an
/// unterminated string. Doubling is inferred from that inverse step — confirm
/// against the upstream source.
fn normalize_latex_for_html(latex: &str) -> String {
    latex.trim().replace('\\', r"\\")
}
459
460fn extract_annotation_tex(html: &str) -> Option<String> {
461 let re = regex::Regex::new(
462 r#"(?is)<annotation\b[^>]*encoding\s*=\s*["']application/x-tex["'][^>]*>(.*?)</annotation>"#,
463 )
464 .ok()?;
465
466 re.captures(html).and_then(|caps| {
467 let text = caps.get(1)?.as_str().trim();
468 (!text.is_empty()).then(|| crate::html::decode_html_entities(text))
469 })
470}
471
472fn extract_attr(tag: &str, attr: &str) -> Option<String> {
473 let re = regex::Regex::new(&format!(
474 r#"(?is)\b{}\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))"#,
475 regex::escape(attr)
476 ))
477 .ok()?;
478
479 re.captures(tag).and_then(|caps| {
480 let value = caps
481 .get(1)
482 .or_else(|| caps.get(2))
483 .or_else(|| caps.get(3))?
484 .as_str()
485 .trim();
486 (!value.is_empty()).then(|| crate::html::decode_html_entities(value))
487 })
488}
489
490pub use browser::BrowserEngine;
492pub use search::{
493 search, SearchDiagnostics, SearchResult, SearchResultItem, DEFAULT_LIMIT, DEFAULT_PROVIDER,
494 SEARCH_PROVIDERS,
495};