1pub mod animation;
37pub mod archive;
38pub mod batch;
39pub mod browser;
40pub mod extract_images;
41pub mod figures;
42pub mod gdocs;
43pub mod github;
44pub mod html;
45pub mod kreuzberg;
46pub mod latex;
47pub mod localize_images;
48pub mod markdown;
49pub mod metadata;
50pub mod postprocess;
51pub mod search;
52pub mod themed_image;
53pub mod verify;
54pub mod xpaste;
55
56use thiserror::Error;
57
58pub const VERSION: &str = env!("CARGO_PKG_VERSION");
60
61#[derive(Error, Debug)]
63pub enum WebCaptureError {
64 #[error("Failed to fetch URL: {0}")]
65 FetchError(String),
66
67 #[error("Failed to parse HTML: {0}")]
68 ParseError(String),
69
70 #[error("Failed to convert to Markdown: {0}")]
71 MarkdownError(String),
72
73 #[error("Failed to capture screenshot: {0}")]
74 ScreenshotError(String),
75
76 #[error("Browser error: {0}")]
77 BrowserError(String),
78
79 #[error("Invalid URL: {0}")]
80 InvalidUrl(String),
81
82 #[error("IO error: {0}")]
83 IoError(#[from] std::io::Error),
84
85 #[error("Request error: {0}")]
86 RequestError(#[from] reqwest::Error),
87}
88
89pub type Result<T> = std::result::Result<T, WebCaptureError>;
91
92pub async fn fetch_html(url: &str) -> Result<String> {
109 html::fetch_html(url).await
110}
111
112pub async fn render_html(url: &str) -> Result<String> {
129 browser::render_html(url).await
130}
131
132pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
147 markdown::convert_html_to_markdown(html, base_url)
148}
149
150pub async fn capture_screenshot(url: &str) -> Result<Vec<u8>> {
167 browser::capture_screenshot(url).await
168}
169
170#[must_use]
181pub fn convert_relative_urls(html: &str, base_url: &str) -> String {
182 html::convert_relative_urls(html, base_url)
183}
184
185#[must_use]
197pub fn convert_to_utf8(html: &str) -> String {
198 html::convert_to_utf8(html)
199}
200
/// Switches controlling the optional passes of
/// `convert_html_to_markdown_enhanced`.
///
/// [`Default`] enables every boolean pass and sets no selectors.
// Each flag toggles an independent pass, so a set of plain booleans is the
// natural representation; the clippy lint is silenced deliberately.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EnhancedOptions {
    /// Rewrite formula elements to `$…$` before conversion and normalize the
    /// resulting inline math afterwards.
    pub extract_latex: bool,
    /// Extract article metadata and splice its header/footer blocks into the Markdown.
    pub extract_metadata: bool,
    /// Run the Markdown post-processing pass with its default options.
    pub post_process: bool,
    /// Correct mislabelled `<code>` language classes before conversion.
    pub detect_code_language: bool,
    /// Selector used to narrow the HTML to the main content; when
    /// `body_selector` is also set it is only used to locate the title `h1`.
    pub content_selector: Option<String>,
    /// Selector for the article body; when set, the body is combined with the
    /// first matching `h1` title.
    pub body_selector: Option<String>,
}

impl Default for EnhancedOptions {
    /// All boolean passes enabled, no selectors.
    fn default() -> Self {
        Self {
            extract_latex: true,
            extract_metadata: true,
            post_process: true,
            detect_code_language: true,
            content_selector: None,
            body_selector: None,
        }
    }
}
231
232#[derive(Debug, Clone)]
234pub struct EnhancedMarkdownResult {
235 pub markdown: String,
236 pub metadata: Option<metadata::ArticleMetadata>,
237}
238
239pub fn convert_html_to_markdown_enhanced(
258 html: &str,
259 base_url: Option<&str>,
260 options: &EnhancedOptions,
261) -> Result<EnhancedMarkdownResult> {
262 let mut html_for_markdown = scope_html_with_selectors(html, options);
263
264 if options.extract_latex {
265 html_for_markdown = replace_latex_formula_elements(&html_for_markdown);
266 }
267
268 if options.detect_code_language {
269 html_for_markdown = correct_code_languages(&html_for_markdown);
270 }
271
272 let mut md = markdown::convert_html_to_markdown(&html_for_markdown, base_url)?;
274
275 let extracted_metadata = if options.extract_metadata {
277 let meta = metadata::extract_metadata(html);
278 let header_lines = metadata::format_metadata_block(&meta);
280 if !header_lines.is_empty() {
281 let header = header_lines.join("\n");
282 if let Some(pos) = md.find("\n\n") {
284 md = format!("{}\n\n{}\n{}", &md[..pos], header, &md[pos + 2..]);
285 } else {
286 md = format!("{header}\n\n{md}");
287 }
288 }
289 let footer_lines = metadata::format_footer_block(&meta);
291 if !footer_lines.is_empty() {
292 md.push_str("\n\n");
293 md.push_str(&footer_lines.join("\n"));
294 }
295 Some(meta)
296 } else {
297 None
298 };
299
300 if options.post_process {
302 md = postprocess::post_process_markdown(&md, &postprocess::PostProcessOptions::default());
303 }
304
305 if options.extract_latex {
306 md = normalize_extracted_latex_markdown(&md);
307 }
308
309 Ok(EnhancedMarkdownResult {
310 markdown: md,
311 metadata: extracted_metadata,
312 })
313}
314
315pub fn convert_with_kreuzberg(
334 html: &str,
335 base_url: Option<&str>,
336) -> Result<kreuzberg::KreuzbergResult> {
337 kreuzberg::convert_with_kreuzberg(html, base_url)
338}
339
340pub fn convert_with_kreuzberg_enhanced(
349 html: &str,
350 base_url: Option<&str>,
351 options: &EnhancedOptions,
352) -> Result<kreuzberg::KreuzbergResult> {
353 let scoped_html = scope_html_with_selectors(html, options);
354 kreuzberg::convert_with_kreuzberg(&scoped_html, base_url)
355}
356
357fn normalize_extracted_latex_markdown(markdown: &str) -> String {
358 let re = regex::Regex::new(r"\$([^$\n]+)\$").expect("valid regex");
359 re.replace_all(markdown, |caps: ®ex::Captures<'_>| {
360 let formula = caps.get(1).map_or("", |m| m.as_str()).replace(r"\\", r"\");
361 format!("${formula}$")
362 })
363 .into_owned()
364}
365
366fn scope_html_with_selectors(html: &str, options: &EnhancedOptions) -> String {
367 if let Some(body_selector) = options.body_selector.as_deref() {
368 let body_html = markdown::select_html(html, body_selector);
369 let title_selector = options
370 .content_selector
371 .as_deref()
372 .map_or_else(|| "h1".to_string(), |selector| format!("{selector} h1, h1"));
373 let title_html = markdown::select_html(html, &title_selector);
374 return match (title_html, body_html) {
375 (Some(title), Some(body)) => format!("{title}\n{body}"),
376 (None, Some(body)) => body,
377 _ => html.to_string(),
378 };
379 }
380
381 options
382 .content_selector
383 .as_deref()
384 .and_then(|selector| markdown::select_html(html, selector))
385 .unwrap_or_else(|| html.to_string())
386}
387
388fn replace_latex_formula_elements(html: &str) -> String {
389 let mut result = html.to_string();
390
391 let img_formula_re = regex::Regex::new(r"(?is)<img\b[^>]*>").expect("valid regex");
392 result = img_formula_re
393 .replace_all(&result, |caps: ®ex::Captures<'_>| {
394 let tag = caps.get(0).map_or("", |m| m.as_str());
395 if is_formula_img_tag(tag) {
396 extract_attr(tag, "source")
397 .or_else(|| extract_attr(tag, "alt"))
398 .map_or_else(
399 || tag.to_string(),
400 |latex| format!("${}$", normalize_latex_for_html(&latex)),
401 )
402 } else {
403 tag.to_string()
404 }
405 })
406 .into_owned();
407
408 let math_attr_re = regex::Regex::new(
409 r"(?is)<(?P<tag>mjx-container|span|div)\b(?P<attrs>[^>]*)>.*?</(?P<tag_close>mjx-container|span|div)>",
410 )
411 .expect("valid regex");
412 math_attr_re
413 .replace_all(&result, |caps: ®ex::Captures<'_>| {
414 let full = caps.get(0).map_or("", |m| m.as_str());
415 let attrs = caps.name("attrs").map_or("", |m| m.as_str());
416 let tag = caps
417 .name("tag")
418 .map_or("", |m| m.as_str())
419 .to_ascii_lowercase();
420 let tag_close = caps
421 .name("tag_close")
422 .map_or("", |m| m.as_str())
423 .to_ascii_lowercase();
424
425 if tag != tag_close || !is_math_attrs(&tag, attrs) {
426 return full.to_string();
427 }
428
429 extract_attr(attrs, "data-tex")
430 .or_else(|| extract_attr(attrs, "data-latex"))
431 .or_else(|| extract_annotation_tex(full))
432 .map_or_else(
433 || full.to_string(),
434 |latex| format!("${}$", normalize_latex_for_html(&latex)),
435 )
436 })
437 .into_owned()
438}
439
440fn correct_code_languages(html: &str) -> String {
441 let code_re = regex::Regex::new(r"(?is)<code\b(?P<attrs>[^>]*)>(?P<body>.*?)</code>")
442 .expect("valid regex");
443
444 code_re
445 .replace_all(html, |caps: ®ex::Captures<'_>| {
446 let full = caps.get(0).map_or("", |m| m.as_str());
447 let attrs = caps.name("attrs").map_or("", |m| m.as_str());
448 let body = caps.name("body").map_or("", |m| m.as_str());
449
450 if !has_matlab_language(attrs) || !looks_like_coq(body) {
451 return full.to_string();
452 }
453
454 let updated_attrs = attrs
455 .replace("language-matlab", "language-coq")
456 .replace(r#"class="matlab""#, r#"class="coq""#)
457 .replace("class='matlab'", "class='coq'");
458
459 format!("<code{updated_attrs}>{body}</code>")
460 })
461 .into_owned()
462}
463
464fn is_formula_img_tag(tag: &str) -> bool {
465 extract_attr(tag, "source").is_some()
466 || extract_attr(tag, "class").is_some_and(|classes| classes.contains("formula"))
467}
468
469fn is_math_attrs(tag: &str, attrs: &str) -> bool {
470 tag == "mjx-container"
471 || extract_attr(attrs, "class").is_some_and(|classes| {
472 classes.contains("katex") || classes.contains("math") || classes.contains("MathJax")
473 })
474}
475
476fn has_matlab_language(attrs: &str) -> bool {
477 extract_attr(attrs, "class").is_some_and(|classes| {
478 classes
479 .split_whitespace()
480 .any(|class| class == "language-matlab" || class == "matlab")
481 })
482}
483
484fn looks_like_coq(text: &str) -> bool {
485 let decoded = crate::html::decode_html_entities(text);
486 [
487 "Require Import",
488 "Definition",
489 "Fixpoint",
490 "Lemma",
491 "Theorem",
492 "Proof",
493 "Qed",
494 "Notation",
495 "Inductive",
496 ]
497 .iter()
498 .any(|needle| decoded.contains(needle))
499}
500
/// Prepares LaTeX text for insertion into HTML.
///
/// Surrounding whitespace is trimmed and every backslash is written as the
/// numeric character reference `&#92;`, so the inserted text contains no
/// literal backslash characters.
fn normalize_latex_for_html(latex: &str) -> String {
    // `&#92;` is the HTML character reference for `\`.
    latex.trim().replace('\\', "&#92;")
}
504
505fn extract_annotation_tex(html: &str) -> Option<String> {
506 let re = regex::Regex::new(
507 r#"(?is)<annotation\b[^>]*encoding\s*=\s*["']application/x-tex["'][^>]*>(.*?)</annotation>"#,
508 )
509 .ok()?;
510
511 re.captures(html).and_then(|caps| {
512 let text = caps.get(1)?.as_str().trim();
513 (!text.is_empty()).then(|| crate::html::decode_html_entities(text))
514 })
515}
516
517fn extract_attr(tag: &str, attr: &str) -> Option<String> {
518 let re = regex::Regex::new(&format!(
519 r#"(?is)\b{}\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))"#,
520 regex::escape(attr)
521 ))
522 .ok()?;
523
524 re.captures(tag).and_then(|caps| {
525 let value = caps
526 .get(1)
527 .or_else(|| caps.get(2))
528 .or_else(|| caps.get(3))?
529 .as_str()
530 .trim();
531 (!value.is_empty()).then(|| crate::html::decode_html_entities(value))
532 })
533}
534
535pub use browser::BrowserEngine;
537pub use search::{
538 search, SearchDiagnostics, SearchResult, SearchResultItem, DEFAULT_LIMIT, DEFAULT_PROVIDER,
539 SEARCH_PROVIDERS,
540};