1pub mod animation;
37pub mod archive;
38pub mod batch;
39pub mod browser;
40pub mod extract_images;
41pub mod figures;
42pub mod gdocs;
43pub mod html;
44pub mod kreuzberg;
45pub mod latex;
46pub mod localize_images;
47pub mod markdown;
48pub mod metadata;
49pub mod postprocess;
50pub mod search;
51pub mod themed_image;
52pub mod verify;
53
54use thiserror::Error;
55
56pub const VERSION: &str = env!("CARGO_PKG_VERSION");
58
59#[derive(Error, Debug)]
61pub enum WebCaptureError {
62 #[error("Failed to fetch URL: {0}")]
63 FetchError(String),
64
65 #[error("Failed to parse HTML: {0}")]
66 ParseError(String),
67
68 #[error("Failed to convert to Markdown: {0}")]
69 MarkdownError(String),
70
71 #[error("Failed to capture screenshot: {0}")]
72 ScreenshotError(String),
73
74 #[error("Browser error: {0}")]
75 BrowserError(String),
76
77 #[error("Invalid URL: {0}")]
78 InvalidUrl(String),
79
80 #[error("IO error: {0}")]
81 IoError(#[from] std::io::Error),
82
83 #[error("Request error: {0}")]
84 RequestError(#[from] reqwest::Error),
85}
86
87pub type Result<T> = std::result::Result<T, WebCaptureError>;
89
90pub async fn fetch_html(url: &str) -> Result<String> {
107 html::fetch_html(url).await
108}
109
110pub async fn render_html(url: &str) -> Result<String> {
127 browser::render_html(url).await
128}
129
130pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
145 markdown::convert_html_to_markdown(html, base_url)
146}
147
148pub async fn capture_screenshot(url: &str) -> Result<Vec<u8>> {
165 browser::capture_screenshot(url).await
166}
167
168#[must_use]
179pub fn convert_relative_urls(html: &str, base_url: &str) -> String {
180 html::convert_relative_urls(html, base_url)
181}
182
183#[must_use]
195pub fn convert_to_utf8(html: &str) -> String {
196 html::convert_to_utf8(html)
197}
198
/// Switches and selectors consumed by [`convert_html_to_markdown_enhanced`]
/// and [`convert_with_kreuzberg_enhanced`].
// Each flag toggles an independent processing stage, so plain bools are kept
// instead of folding them into an enum.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Clone)]
pub struct EnhancedOptions {
    /// Replace formula elements with `$...$` before conversion and normalize
    /// the extracted formulas afterwards.
    pub extract_latex: bool,
    /// Extract article metadata and add its header/footer blocks to the output.
    pub extract_metadata: bool,
    /// Run the Markdown post-processing pass with its default options.
    pub post_process: bool,
    /// Run the code-language correction pass on `<code>` elements.
    pub detect_code_language: bool,
    /// Selector used to narrow the HTML when `body_selector` is unset; when
    /// `body_selector` is set it only contributes to the title lookup.
    pub content_selector: Option<String>,
    /// Selector for the body fragment; when set, the output is built from the
    /// selected title (if any) followed by this fragment.
    pub body_selector: Option<String>,
}

impl Default for EnhancedOptions {
    /// All processing stages enabled, no selector scoping.
    fn default() -> Self {
        let enabled = true;
        Self {
            extract_latex: enabled,
            extract_metadata: enabled,
            post_process: enabled,
            detect_code_language: enabled,
            content_selector: None,
            body_selector: None,
        }
    }
}
229
230#[derive(Debug, Clone)]
232pub struct EnhancedMarkdownResult {
233 pub markdown: String,
234 pub metadata: Option<metadata::ArticleMetadata>,
235}
236
237pub fn convert_html_to_markdown_enhanced(
256 html: &str,
257 base_url: Option<&str>,
258 options: &EnhancedOptions,
259) -> Result<EnhancedMarkdownResult> {
260 let mut html_for_markdown = scope_html_with_selectors(html, options);
261
262 if options.extract_latex {
263 html_for_markdown = replace_latex_formula_elements(&html_for_markdown);
264 }
265
266 if options.detect_code_language {
267 html_for_markdown = correct_code_languages(&html_for_markdown);
268 }
269
270 let mut md = markdown::convert_html_to_markdown(&html_for_markdown, base_url)?;
272
273 let extracted_metadata = if options.extract_metadata {
275 let meta = metadata::extract_metadata(html);
276 let header_lines = metadata::format_metadata_block(&meta);
278 if !header_lines.is_empty() {
279 let header = header_lines.join("\n");
280 if let Some(pos) = md.find("\n\n") {
282 md = format!("{}\n\n{}\n{}", &md[..pos], header, &md[pos + 2..]);
283 } else {
284 md = format!("{header}\n\n{md}");
285 }
286 }
287 let footer_lines = metadata::format_footer_block(&meta);
289 if !footer_lines.is_empty() {
290 md.push_str("\n\n");
291 md.push_str(&footer_lines.join("\n"));
292 }
293 Some(meta)
294 } else {
295 None
296 };
297
298 if options.post_process {
300 md = postprocess::post_process_markdown(&md, &postprocess::PostProcessOptions::default());
301 }
302
303 if options.extract_latex {
304 md = normalize_extracted_latex_markdown(&md);
305 }
306
307 Ok(EnhancedMarkdownResult {
308 markdown: md,
309 metadata: extracted_metadata,
310 })
311}
312
313pub fn convert_with_kreuzberg(
332 html: &str,
333 base_url: Option<&str>,
334) -> Result<kreuzberg::KreuzbergResult> {
335 kreuzberg::convert_with_kreuzberg(html, base_url)
336}
337
338pub fn convert_with_kreuzberg_enhanced(
347 html: &str,
348 base_url: Option<&str>,
349 options: &EnhancedOptions,
350) -> Result<kreuzberg::KreuzbergResult> {
351 let scoped_html = scope_html_with_selectors(html, options);
352 kreuzberg::convert_with_kreuzberg(&scoped_html, base_url)
353}
354
355fn normalize_extracted_latex_markdown(markdown: &str) -> String {
356 let re = regex::Regex::new(r"\$([^$\n]+)\$").expect("valid regex");
357 re.replace_all(markdown, |caps: ®ex::Captures<'_>| {
358 let formula = caps.get(1).map_or("", |m| m.as_str()).replace(r"\\", r"\");
359 format!("${formula}$")
360 })
361 .into_owned()
362}
363
364fn scope_html_with_selectors(html: &str, options: &EnhancedOptions) -> String {
365 if let Some(body_selector) = options.body_selector.as_deref() {
366 let body_html = markdown::select_html(html, body_selector);
367 let title_selector = options
368 .content_selector
369 .as_deref()
370 .map_or_else(|| "h1".to_string(), |selector| format!("{selector} h1, h1"));
371 let title_html = markdown::select_html(html, &title_selector);
372 return match (title_html, body_html) {
373 (Some(title), Some(body)) => format!("{title}\n{body}"),
374 (None, Some(body)) => body,
375 _ => html.to_string(),
376 };
377 }
378
379 options
380 .content_selector
381 .as_deref()
382 .and_then(|selector| markdown::select_html(html, selector))
383 .unwrap_or_else(|| html.to_string())
384}
385
386fn replace_latex_formula_elements(html: &str) -> String {
387 let mut result = html.to_string();
388
389 let img_formula_re = regex::Regex::new(r"(?is)<img\b[^>]*>").expect("valid regex");
390 result = img_formula_re
391 .replace_all(&result, |caps: ®ex::Captures<'_>| {
392 let tag = caps.get(0).map_or("", |m| m.as_str());
393 if is_formula_img_tag(tag) {
394 extract_attr(tag, "source")
395 .or_else(|| extract_attr(tag, "alt"))
396 .map_or_else(
397 || tag.to_string(),
398 |latex| format!("${}$", normalize_latex_for_html(&latex)),
399 )
400 } else {
401 tag.to_string()
402 }
403 })
404 .into_owned();
405
406 let math_attr_re = regex::Regex::new(
407 r"(?is)<(?P<tag>mjx-container|span|div)\b(?P<attrs>[^>]*)>.*?</(?P<tag_close>mjx-container|span|div)>",
408 )
409 .expect("valid regex");
410 math_attr_re
411 .replace_all(&result, |caps: ®ex::Captures<'_>| {
412 let full = caps.get(0).map_or("", |m| m.as_str());
413 let attrs = caps.name("attrs").map_or("", |m| m.as_str());
414 let tag = caps
415 .name("tag")
416 .map_or("", |m| m.as_str())
417 .to_ascii_lowercase();
418 let tag_close = caps
419 .name("tag_close")
420 .map_or("", |m| m.as_str())
421 .to_ascii_lowercase();
422
423 if tag != tag_close || !is_math_attrs(&tag, attrs) {
424 return full.to_string();
425 }
426
427 extract_attr(attrs, "data-tex")
428 .or_else(|| extract_attr(attrs, "data-latex"))
429 .or_else(|| extract_annotation_tex(full))
430 .map_or_else(
431 || full.to_string(),
432 |latex| format!("${}$", normalize_latex_for_html(&latex)),
433 )
434 })
435 .into_owned()
436}
437
438fn correct_code_languages(html: &str) -> String {
439 let code_re = regex::Regex::new(r"(?is)<code\b(?P<attrs>[^>]*)>(?P<body>.*?)</code>")
440 .expect("valid regex");
441
442 code_re
443 .replace_all(html, |caps: ®ex::Captures<'_>| {
444 let full = caps.get(0).map_or("", |m| m.as_str());
445 let attrs = caps.name("attrs").map_or("", |m| m.as_str());
446 let body = caps.name("body").map_or("", |m| m.as_str());
447
448 if !has_matlab_language(attrs) || !looks_like_coq(body) {
449 return full.to_string();
450 }
451
452 let updated_attrs = attrs
453 .replace("language-matlab", "language-coq")
454 .replace(r#"class="matlab""#, r#"class="coq""#)
455 .replace("class='matlab'", "class='coq'");
456
457 format!("<code{updated_attrs}>{body}</code>")
458 })
459 .into_owned()
460}
461
462fn is_formula_img_tag(tag: &str) -> bool {
463 extract_attr(tag, "source").is_some()
464 || extract_attr(tag, "class").is_some_and(|classes| classes.contains("formula"))
465}
466
467fn is_math_attrs(tag: &str, attrs: &str) -> bool {
468 tag == "mjx-container"
469 || extract_attr(attrs, "class").is_some_and(|classes| {
470 classes.contains("katex") || classes.contains("math") || classes.contains("MathJax")
471 })
472}
473
474fn has_matlab_language(attrs: &str) -> bool {
475 extract_attr(attrs, "class").is_some_and(|classes| {
476 classes
477 .split_whitespace()
478 .any(|class| class == "language-matlab" || class == "matlab")
479 })
480}
481
482fn looks_like_coq(text: &str) -> bool {
483 let decoded = crate::html::decode_html_entities(text);
484 [
485 "Require Import",
486 "Definition",
487 "Fixpoint",
488 "Lemma",
489 "Theorem",
490 "Proof",
491 "Qed",
492 "Notation",
493 "Inductive",
494 ]
495 .iter()
496 .any(|needle| decoded.contains(needle))
497}
498
/// Prepares a LaTeX snippet for re-insertion into HTML text.
///
/// Surrounding whitespace is trimmed and every backslash is written as the
/// numeric character reference `&#92;`.
fn normalize_latex_for_html(latex: &str) -> String {
    // NOTE(review): the replacement literal had been reduced to an
    // unterminated `"\"` (apparently by an HTML-entity decoding step), which
    // does not compile. `&#92;` is the restored entity for a backslash —
    // confirm it matches the original spelling.
    latex.trim().replace('\\', "&#92;")
}
502
503fn extract_annotation_tex(html: &str) -> Option<String> {
504 let re = regex::Regex::new(
505 r#"(?is)<annotation\b[^>]*encoding\s*=\s*["']application/x-tex["'][^>]*>(.*?)</annotation>"#,
506 )
507 .ok()?;
508
509 re.captures(html).and_then(|caps| {
510 let text = caps.get(1)?.as_str().trim();
511 (!text.is_empty()).then(|| crate::html::decode_html_entities(text))
512 })
513}
514
515fn extract_attr(tag: &str, attr: &str) -> Option<String> {
516 let re = regex::Regex::new(&format!(
517 r#"(?is)\b{}\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))"#,
518 regex::escape(attr)
519 ))
520 .ok()?;
521
522 re.captures(tag).and_then(|caps| {
523 let value = caps
524 .get(1)
525 .or_else(|| caps.get(2))
526 .or_else(|| caps.get(3))?
527 .as_str()
528 .trim();
529 (!value.is_empty()).then(|| crate::html::decode_html_entities(value))
530 })
531}
532
533pub use browser::BrowserEngine;
535pub use search::{
536 search, SearchDiagnostics, SearchResult, SearchResultItem, DEFAULT_LIMIT, DEFAULT_PROVIDER,
537 SEARCH_PROVIDERS,
538};