1pub mod animation;
37pub mod archive;
38pub mod batch;
39pub mod browser;
40pub mod extract_images;
41pub mod figures;
42pub mod gdocs;
43pub mod html;
44pub mod kreuzberg;
45pub mod latex;
46pub mod localize_images;
47pub mod markdown;
48pub mod metadata;
49pub mod postprocess;
50pub mod search;
51pub mod themed_image;
52pub mod verify;
53pub mod xpaste;
54
55use thiserror::Error;
56
57pub const VERSION: &str = env!("CARGO_PKG_VERSION");
59
60#[derive(Error, Debug)]
62pub enum WebCaptureError {
63 #[error("Failed to fetch URL: {0}")]
64 FetchError(String),
65
66 #[error("Failed to parse HTML: {0}")]
67 ParseError(String),
68
69 #[error("Failed to convert to Markdown: {0}")]
70 MarkdownError(String),
71
72 #[error("Failed to capture screenshot: {0}")]
73 ScreenshotError(String),
74
75 #[error("Browser error: {0}")]
76 BrowserError(String),
77
78 #[error("Invalid URL: {0}")]
79 InvalidUrl(String),
80
81 #[error("IO error: {0}")]
82 IoError(#[from] std::io::Error),
83
84 #[error("Request error: {0}")]
85 RequestError(#[from] reqwest::Error),
86}
87
88pub type Result<T> = std::result::Result<T, WebCaptureError>;
90
91pub async fn fetch_html(url: &str) -> Result<String> {
108 html::fetch_html(url).await
109}
110
111pub async fn render_html(url: &str) -> Result<String> {
128 browser::render_html(url).await
129}
130
131pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
146 markdown::convert_html_to_markdown(html, base_url)
147}
148
149pub async fn capture_screenshot(url: &str) -> Result<Vec<u8>> {
166 browser::capture_screenshot(url).await
167}
168
169#[must_use]
180pub fn convert_relative_urls(html: &str, base_url: &str) -> String {
181 html::convert_relative_urls(html, base_url)
182}
183
184#[must_use]
196pub fn convert_to_utf8(html: &str) -> String {
197 html::convert_to_utf8(html)
198}
199
/// Switches for [`convert_html_to_markdown_enhanced`].
///
/// [`convert_with_kreuzberg_enhanced`] accepts the same struct but reads only
/// the two selector fields.
// Each flag toggles one independent pipeline stage, so separate booleans are
// clearer here than a state enum; that is why the lint is silenced.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Clone)]
pub struct EnhancedOptions {
    /// Rewrite formula markup to `$…$` text before conversion and collapse
    /// doubled backslashes inside `$…$` spans afterwards.
    pub extract_latex: bool,
    /// Extract article metadata from the input HTML, insert the formatted
    /// header and footer blocks into the Markdown, and return the metadata.
    pub extract_metadata: bool,
    /// Run `postprocess::post_process_markdown` (with its default options) on
    /// the Markdown.
    pub post_process: bool,
    /// Relabel `<code>` elements tagged as MATLAB whose content looks like Coq.
    pub detect_code_language: bool,
    /// Selector handed to `markdown::select_html`. Without `body_selector`,
    /// only the selected HTML is converted when the selector matches. With
    /// `body_selector`, it is used only to look for the title
    /// (`<selector> h1`, falling back to `h1`).
    pub content_selector: Option<String>,
    /// Selector for the main body. When it matches, the converted HTML is the
    /// title selection (if any) followed by the body selection.
    pub body_selector: Option<String>,
}

impl Default for EnhancedOptions {
    /// Enables every processing stage and sets no selectors.
    fn default() -> Self {
        Self {
            extract_latex: true,
            extract_metadata: true,
            post_process: true,
            detect_code_language: true,
            content_selector: None,
            body_selector: None,
        }
    }
}
230
231#[derive(Debug, Clone)]
233pub struct EnhancedMarkdownResult {
234 pub markdown: String,
235 pub metadata: Option<metadata::ArticleMetadata>,
236}
237
238pub fn convert_html_to_markdown_enhanced(
257 html: &str,
258 base_url: Option<&str>,
259 options: &EnhancedOptions,
260) -> Result<EnhancedMarkdownResult> {
261 let mut html_for_markdown = scope_html_with_selectors(html, options);
262
263 if options.extract_latex {
264 html_for_markdown = replace_latex_formula_elements(&html_for_markdown);
265 }
266
267 if options.detect_code_language {
268 html_for_markdown = correct_code_languages(&html_for_markdown);
269 }
270
271 let mut md = markdown::convert_html_to_markdown(&html_for_markdown, base_url)?;
273
274 let extracted_metadata = if options.extract_metadata {
276 let meta = metadata::extract_metadata(html);
277 let header_lines = metadata::format_metadata_block(&meta);
279 if !header_lines.is_empty() {
280 let header = header_lines.join("\n");
281 if let Some(pos) = md.find("\n\n") {
283 md = format!("{}\n\n{}\n{}", &md[..pos], header, &md[pos + 2..]);
284 } else {
285 md = format!("{header}\n\n{md}");
286 }
287 }
288 let footer_lines = metadata::format_footer_block(&meta);
290 if !footer_lines.is_empty() {
291 md.push_str("\n\n");
292 md.push_str(&footer_lines.join("\n"));
293 }
294 Some(meta)
295 } else {
296 None
297 };
298
299 if options.post_process {
301 md = postprocess::post_process_markdown(&md, &postprocess::PostProcessOptions::default());
302 }
303
304 if options.extract_latex {
305 md = normalize_extracted_latex_markdown(&md);
306 }
307
308 Ok(EnhancedMarkdownResult {
309 markdown: md,
310 metadata: extracted_metadata,
311 })
312}
313
314pub fn convert_with_kreuzberg(
333 html: &str,
334 base_url: Option<&str>,
335) -> Result<kreuzberg::KreuzbergResult> {
336 kreuzberg::convert_with_kreuzberg(html, base_url)
337}
338
339pub fn convert_with_kreuzberg_enhanced(
348 html: &str,
349 base_url: Option<&str>,
350 options: &EnhancedOptions,
351) -> Result<kreuzberg::KreuzbergResult> {
352 let scoped_html = scope_html_with_selectors(html, options);
353 kreuzberg::convert_with_kreuzberg(&scoped_html, base_url)
354}
355
356fn normalize_extracted_latex_markdown(markdown: &str) -> String {
357 let re = regex::Regex::new(r"\$([^$\n]+)\$").expect("valid regex");
358 re.replace_all(markdown, |caps: ®ex::Captures<'_>| {
359 let formula = caps.get(1).map_or("", |m| m.as_str()).replace(r"\\", r"\");
360 format!("${formula}$")
361 })
362 .into_owned()
363}
364
365fn scope_html_with_selectors(html: &str, options: &EnhancedOptions) -> String {
366 if let Some(body_selector) = options.body_selector.as_deref() {
367 let body_html = markdown::select_html(html, body_selector);
368 let title_selector = options
369 .content_selector
370 .as_deref()
371 .map_or_else(|| "h1".to_string(), |selector| format!("{selector} h1, h1"));
372 let title_html = markdown::select_html(html, &title_selector);
373 return match (title_html, body_html) {
374 (Some(title), Some(body)) => format!("{title}\n{body}"),
375 (None, Some(body)) => body,
376 _ => html.to_string(),
377 };
378 }
379
380 options
381 .content_selector
382 .as_deref()
383 .and_then(|selector| markdown::select_html(html, selector))
384 .unwrap_or_else(|| html.to_string())
385}
386
387fn replace_latex_formula_elements(html: &str) -> String {
388 let mut result = html.to_string();
389
390 let img_formula_re = regex::Regex::new(r"(?is)<img\b[^>]*>").expect("valid regex");
391 result = img_formula_re
392 .replace_all(&result, |caps: ®ex::Captures<'_>| {
393 let tag = caps.get(0).map_or("", |m| m.as_str());
394 if is_formula_img_tag(tag) {
395 extract_attr(tag, "source")
396 .or_else(|| extract_attr(tag, "alt"))
397 .map_or_else(
398 || tag.to_string(),
399 |latex| format!("${}$", normalize_latex_for_html(&latex)),
400 )
401 } else {
402 tag.to_string()
403 }
404 })
405 .into_owned();
406
407 let math_attr_re = regex::Regex::new(
408 r"(?is)<(?P<tag>mjx-container|span|div)\b(?P<attrs>[^>]*)>.*?</(?P<tag_close>mjx-container|span|div)>",
409 )
410 .expect("valid regex");
411 math_attr_re
412 .replace_all(&result, |caps: ®ex::Captures<'_>| {
413 let full = caps.get(0).map_or("", |m| m.as_str());
414 let attrs = caps.name("attrs").map_or("", |m| m.as_str());
415 let tag = caps
416 .name("tag")
417 .map_or("", |m| m.as_str())
418 .to_ascii_lowercase();
419 let tag_close = caps
420 .name("tag_close")
421 .map_or("", |m| m.as_str())
422 .to_ascii_lowercase();
423
424 if tag != tag_close || !is_math_attrs(&tag, attrs) {
425 return full.to_string();
426 }
427
428 extract_attr(attrs, "data-tex")
429 .or_else(|| extract_attr(attrs, "data-latex"))
430 .or_else(|| extract_annotation_tex(full))
431 .map_or_else(
432 || full.to_string(),
433 |latex| format!("${}$", normalize_latex_for_html(&latex)),
434 )
435 })
436 .into_owned()
437}
438
439fn correct_code_languages(html: &str) -> String {
440 let code_re = regex::Regex::new(r"(?is)<code\b(?P<attrs>[^>]*)>(?P<body>.*?)</code>")
441 .expect("valid regex");
442
443 code_re
444 .replace_all(html, |caps: ®ex::Captures<'_>| {
445 let full = caps.get(0).map_or("", |m| m.as_str());
446 let attrs = caps.name("attrs").map_or("", |m| m.as_str());
447 let body = caps.name("body").map_or("", |m| m.as_str());
448
449 if !has_matlab_language(attrs) || !looks_like_coq(body) {
450 return full.to_string();
451 }
452
453 let updated_attrs = attrs
454 .replace("language-matlab", "language-coq")
455 .replace(r#"class="matlab""#, r#"class="coq""#)
456 .replace("class='matlab'", "class='coq'");
457
458 format!("<code{updated_attrs}>{body}</code>")
459 })
460 .into_owned()
461}
462
463fn is_formula_img_tag(tag: &str) -> bool {
464 extract_attr(tag, "source").is_some()
465 || extract_attr(tag, "class").is_some_and(|classes| classes.contains("formula"))
466}
467
468fn is_math_attrs(tag: &str, attrs: &str) -> bool {
469 tag == "mjx-container"
470 || extract_attr(attrs, "class").is_some_and(|classes| {
471 classes.contains("katex") || classes.contains("math") || classes.contains("MathJax")
472 })
473}
474
475fn has_matlab_language(attrs: &str) -> bool {
476 extract_attr(attrs, "class").is_some_and(|classes| {
477 classes
478 .split_whitespace()
479 .any(|class| class == "language-matlab" || class == "matlab")
480 })
481}
482
483fn looks_like_coq(text: &str) -> bool {
484 let decoded = crate::html::decode_html_entities(text);
485 [
486 "Require Import",
487 "Definition",
488 "Fixpoint",
489 "Lemma",
490 "Theorem",
491 "Proof",
492 "Qed",
493 "Notation",
494 "Inductive",
495 ]
496 .iter()
497 .any(|needle| decoded.contains(needle))
498}
499
/// Prepares extracted LaTeX for insertion into HTML text.
///
/// Surrounding whitespace is trimmed and every backslash is written as the
/// numeric character reference `&#92;`, which an HTML parser decodes back to
/// a literal backslash.
// NOTE(review): the replacement literal reached review as an unterminated
// `"\"`, apparently entity-decoded in transit. `&#92;` is the most plausible
// original — confirm against version control.
fn normalize_latex_for_html(latex: &str) -> String {
    latex.trim().replace('\\', "&#92;")
}
503
504fn extract_annotation_tex(html: &str) -> Option<String> {
505 let re = regex::Regex::new(
506 r#"(?is)<annotation\b[^>]*encoding\s*=\s*["']application/x-tex["'][^>]*>(.*?)</annotation>"#,
507 )
508 .ok()?;
509
510 re.captures(html).and_then(|caps| {
511 let text = caps.get(1)?.as_str().trim();
512 (!text.is_empty()).then(|| crate::html::decode_html_entities(text))
513 })
514}
515
516fn extract_attr(tag: &str, attr: &str) -> Option<String> {
517 let re = regex::Regex::new(&format!(
518 r#"(?is)\b{}\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))"#,
519 regex::escape(attr)
520 ))
521 .ok()?;
522
523 re.captures(tag).and_then(|caps| {
524 let value = caps
525 .get(1)
526 .or_else(|| caps.get(2))
527 .or_else(|| caps.get(3))?
528 .as_str()
529 .trim();
530 (!value.is_empty()).then(|| crate::html::decode_html_entities(value))
531 })
532}
533
534pub use browser::BrowserEngine;
536pub use search::{
537 search, SearchDiagnostics, SearchResult, SearchResultItem, DEFAULT_LIMIT, DEFAULT_PROVIDER,
538 SEARCH_PROVIDERS,
539};