1pub mod animation;
37pub mod archive;
38pub mod batch;
39pub mod browser;
40pub mod extract_images;
41pub mod figures;
42pub mod gdocs;
43pub mod github;
44pub mod html;
45pub mod kreuzberg;
46pub mod latex;
47pub mod localize_images;
48pub mod markdown;
49pub mod metadata;
50pub mod postprocess;
51pub mod search;
52pub mod stackoverflow;
53pub mod themed_image;
54pub mod verify;
55pub mod xpaste;
56
57use thiserror::Error;
58
/// Version string of this crate, captured at compile time from the
/// `CARGO_PKG_VERSION` environment variable via `env!`.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
61
/// Error type shared by the public functions of this crate.
///
/// The `Display` text of each variant is generated by `thiserror` from its
/// `#[error("…")]` attribute; `{0}` is replaced by the variant's payload.
#[derive(Error, Debug)]
pub enum WebCaptureError {
    /// Fetching a URL failed; the payload is the message shown after the prefix.
    #[error("Failed to fetch URL: {0}")]
    FetchError(String),

    /// HTML could not be parsed; the payload is the message shown after the prefix.
    #[error("Failed to parse HTML: {0}")]
    ParseError(String),

    /// HTML-to-Markdown conversion failed; the payload is the message shown after the prefix.
    #[error("Failed to convert to Markdown: {0}")]
    MarkdownError(String),

    /// Taking a screenshot failed; the payload is the message shown after the prefix.
    #[error("Failed to capture screenshot: {0}")]
    ScreenshotError(String),

    /// A browser-related failure; the payload is the message shown after the prefix.
    #[error("Browser error: {0}")]
    BrowserError(String),

    /// A URL was rejected as invalid; the payload is the message shown after the prefix.
    #[error("Invalid URL: {0}")]
    InvalidUrl(String),

    /// Wraps a `std::io::Error`. `#[from]` generates the `From` impl, so `?`
    /// converts I/O errors into this variant automatically.
    #[error("IO error: {0}")]
    IoError(#[from] std::io::Error),

    /// Wraps a `reqwest::Error`. `#[from]` generates the `From` impl, so `?`
    /// converts request errors into this variant automatically.
    #[error("Request error: {0}")]
    RequestError(#[from] reqwest::Error),
}
89
/// Crate-wide result alias whose error type is fixed to [`WebCaptureError`].
pub type Result<T> = std::result::Result<T, WebCaptureError>;
92
/// Crate-root entry point that forwards `url` to [`html::fetch_html`] and
/// awaits its result.
///
/// # Errors
///
/// Returns whatever error [`html::fetch_html`] reports.
pub async fn fetch_html(url: &str) -> Result<String> {
    html::fetch_html(url).await
}
112
/// Crate-root entry point that forwards `url` to [`browser::render_html`] and
/// awaits its result.
///
/// # Errors
///
/// Returns whatever error [`browser::render_html`] reports.
pub async fn render_html(url: &str) -> Result<String> {
    browser::render_html(url).await
}
132
/// Crate-root entry point that forwards `html` and the optional `base_url` to
/// [`markdown::convert_html_to_markdown`] unchanged.
///
/// # Errors
///
/// Returns whatever error [`markdown::convert_html_to_markdown`] reports.
pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
    markdown::convert_html_to_markdown(html, base_url)
}
150
/// Crate-root entry point that forwards `url` to
/// [`browser::capture_screenshot`] and awaits the resulting bytes.
///
/// # Errors
///
/// Returns whatever error [`browser::capture_screenshot`] reports.
pub async fn capture_screenshot(url: &str) -> Result<Vec<u8>> {
    browser::capture_screenshot(url).await
}
170
/// Crate-root entry point that forwards `html` and `base_url` to
/// [`html::convert_relative_urls`] and returns its output.
#[must_use]
pub fn convert_relative_urls(html: &str, base_url: &str) -> String {
    html::convert_relative_urls(html, base_url)
}
185
/// Crate-root entry point that forwards `html` to [`html::convert_to_utf8`]
/// and returns its output.
#[must_use]
pub fn convert_to_utf8(html: &str) -> String {
    html::convert_to_utf8(html)
}
201
/// Switches that control [`convert_html_to_markdown_enhanced`].
///
/// [`convert_with_kreuzberg_enhanced`] also accepts this struct but only reads
/// the two selector fields.
// The boolean fields are independent on/off toggles, hence the lint exemption.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Clone)]
pub struct EnhancedOptions {
    /// When `true`, formula elements are rewritten to `$…$` before conversion
    /// and the resulting Markdown formulas are normalized afterwards.
    pub extract_latex: bool,
    /// When `true`, article metadata is extracted from the original HTML,
    /// returned to the caller, and rendered into header/footer blocks.
    pub extract_metadata: bool,
    /// When `true`, the Markdown is passed through
    /// `postprocess::post_process_markdown` with default options.
    pub post_process: bool,
    /// When `true`, `<code>` elements are checked for a mislabelled language
    /// before conversion.
    pub detect_code_language: bool,
    /// Optional selector used to narrow the HTML that gets converted; when a
    /// `body_selector` is also set it only influences the title lookup.
    pub content_selector: Option<String>,
    /// Optional selector for the body; when set, the converted HTML is the
    /// selected body, preceded by a selected `h1` title if one is found.
    pub body_selector: Option<String>,
}
219
impl Default for EnhancedOptions {
    /// Enables every boolean enhancement and leaves both selectors unset, so
    /// the whole document is converted.
    fn default() -> Self {
        Self {
            extract_latex: true,
            extract_metadata: true,
            post_process: true,
            detect_code_language: true,
            content_selector: None,
            body_selector: None,
        }
    }
}
232
/// Output of [`convert_html_to_markdown_enhanced`].
#[derive(Debug, Clone)]
pub struct EnhancedMarkdownResult {
    /// The final Markdown text.
    pub markdown: String,
    /// Extracted article metadata; `None` when metadata extraction was
    /// disabled in the options.
    pub metadata: Option<metadata::ArticleMetadata>,
}
239
240pub fn convert_html_to_markdown_enhanced(
259 html: &str,
260 base_url: Option<&str>,
261 options: &EnhancedOptions,
262) -> Result<EnhancedMarkdownResult> {
263 let mut html_for_markdown = scope_html_with_selectors(html, options);
264
265 if options.extract_latex {
266 html_for_markdown = replace_latex_formula_elements(&html_for_markdown);
267 }
268
269 if options.detect_code_language {
270 html_for_markdown = correct_code_languages(&html_for_markdown);
271 }
272
273 let mut md = markdown::convert_html_to_markdown(&html_for_markdown, base_url)?;
275
276 let extracted_metadata = if options.extract_metadata {
278 let meta = metadata::extract_metadata(html);
279 let header_lines = metadata::format_metadata_block(&meta);
281 if !header_lines.is_empty() {
282 let header = header_lines.join("\n");
283 if let Some(pos) = md.find("\n\n") {
285 md = format!("{}\n\n{}\n{}", &md[..pos], header, &md[pos + 2..]);
286 } else {
287 md = format!("{header}\n\n{md}");
288 }
289 }
290 let footer_lines = metadata::format_footer_block(&meta);
292 if !footer_lines.is_empty() {
293 md.push_str("\n\n");
294 md.push_str(&footer_lines.join("\n"));
295 }
296 Some(meta)
297 } else {
298 None
299 };
300
301 if options.post_process {
303 md = postprocess::post_process_markdown(&md, &postprocess::PostProcessOptions::default());
304 }
305
306 if options.extract_latex {
307 md = normalize_extracted_latex_markdown(&md);
308 }
309
310 Ok(EnhancedMarkdownResult {
311 markdown: md,
312 metadata: extracted_metadata,
313 })
314}
315
/// Crate-root entry point that forwards `html` and the optional `base_url` to
/// [`kreuzberg::convert_with_kreuzberg`] unchanged.
///
/// # Errors
///
/// Returns whatever error [`kreuzberg::convert_with_kreuzberg`] reports.
pub fn convert_with_kreuzberg(
    html: &str,
    base_url: Option<&str>,
) -> Result<kreuzberg::KreuzbergResult> {
    kreuzberg::convert_with_kreuzberg(html, base_url)
}
340
341pub fn convert_with_kreuzberg_enhanced(
350 html: &str,
351 base_url: Option<&str>,
352 options: &EnhancedOptions,
353) -> Result<kreuzberg::KreuzbergResult> {
354 let scoped_html = scope_html_with_selectors(html, options);
355 kreuzberg::convert_with_kreuzberg(&scoped_html, base_url)
356}
357
358fn normalize_extracted_latex_markdown(markdown: &str) -> String {
359 let re = regex::Regex::new(r"\$([^$\n]+)\$").expect("valid regex");
360 re.replace_all(markdown, |caps: ®ex::Captures<'_>| {
361 let formula = caps.get(1).map_or("", |m| m.as_str()).replace(r"\\", r"\");
362 format!("${formula}$")
363 })
364 .into_owned()
365}
366
367fn scope_html_with_selectors(html: &str, options: &EnhancedOptions) -> String {
368 if let Some(body_selector) = options.body_selector.as_deref() {
369 let body_html = markdown::select_html(html, body_selector);
370 let title_selector = options
371 .content_selector
372 .as_deref()
373 .map_or_else(|| "h1".to_string(), |selector| format!("{selector} h1, h1"));
374 let title_html = markdown::select_html(html, &title_selector);
375 return match (title_html, body_html) {
376 (Some(title), Some(body)) => format!("{title}\n{body}"),
377 (None, Some(body)) => body,
378 _ => html.to_string(),
379 };
380 }
381
382 options
383 .content_selector
384 .as_deref()
385 .and_then(|selector| markdown::select_html(html, selector))
386 .unwrap_or_else(|| html.to_string())
387}
388
389fn replace_latex_formula_elements(html: &str) -> String {
390 let mut result = html.to_string();
391
392 let img_formula_re = regex::Regex::new(r"(?is)<img\b[^>]*>").expect("valid regex");
393 result = img_formula_re
394 .replace_all(&result, |caps: ®ex::Captures<'_>| {
395 let tag = caps.get(0).map_or("", |m| m.as_str());
396 if is_formula_img_tag(tag) {
397 extract_attr(tag, "source")
398 .or_else(|| extract_attr(tag, "alt"))
399 .map_or_else(
400 || tag.to_string(),
401 |latex| format!("${}$", normalize_latex_for_html(&latex)),
402 )
403 } else {
404 tag.to_string()
405 }
406 })
407 .into_owned();
408
409 let math_attr_re = regex::Regex::new(
410 r"(?is)<(?P<tag>mjx-container|span|div)\b(?P<attrs>[^>]*)>.*?</(?P<tag_close>mjx-container|span|div)>",
411 )
412 .expect("valid regex");
413 math_attr_re
414 .replace_all(&result, |caps: ®ex::Captures<'_>| {
415 let full = caps.get(0).map_or("", |m| m.as_str());
416 let attrs = caps.name("attrs").map_or("", |m| m.as_str());
417 let tag = caps
418 .name("tag")
419 .map_or("", |m| m.as_str())
420 .to_ascii_lowercase();
421 let tag_close = caps
422 .name("tag_close")
423 .map_or("", |m| m.as_str())
424 .to_ascii_lowercase();
425
426 if tag != tag_close || !is_math_attrs(&tag, attrs) {
427 return full.to_string();
428 }
429
430 extract_attr(attrs, "data-tex")
431 .or_else(|| extract_attr(attrs, "data-latex"))
432 .or_else(|| extract_annotation_tex(full))
433 .map_or_else(
434 || full.to_string(),
435 |latex| format!("${}$", normalize_latex_for_html(&latex)),
436 )
437 })
438 .into_owned()
439}
440
441fn correct_code_languages(html: &str) -> String {
442 let code_re = regex::Regex::new(r"(?is)<code\b(?P<attrs>[^>]*)>(?P<body>.*?)</code>")
443 .expect("valid regex");
444
445 code_re
446 .replace_all(html, |caps: ®ex::Captures<'_>| {
447 let full = caps.get(0).map_or("", |m| m.as_str());
448 let attrs = caps.name("attrs").map_or("", |m| m.as_str());
449 let body = caps.name("body").map_or("", |m| m.as_str());
450
451 if !has_matlab_language(attrs) || !looks_like_coq(body) {
452 return full.to_string();
453 }
454
455 let updated_attrs = attrs
456 .replace("language-matlab", "language-coq")
457 .replace(r#"class="matlab""#, r#"class="coq""#)
458 .replace("class='matlab'", "class='coq'");
459
460 format!("<code{updated_attrs}>{body}</code>")
461 })
462 .into_owned()
463}
464
465fn is_formula_img_tag(tag: &str) -> bool {
466 extract_attr(tag, "source").is_some()
467 || extract_attr(tag, "class").is_some_and(|classes| classes.contains("formula"))
468}
469
470fn is_math_attrs(tag: &str, attrs: &str) -> bool {
471 tag == "mjx-container"
472 || extract_attr(attrs, "class").is_some_and(|classes| {
473 classes.contains("katex") || classes.contains("math") || classes.contains("MathJax")
474 })
475}
476
477fn has_matlab_language(attrs: &str) -> bool {
478 extract_attr(attrs, "class").is_some_and(|classes| {
479 classes
480 .split_whitespace()
481 .any(|class| class == "language-matlab" || class == "matlab")
482 })
483}
484
485fn looks_like_coq(text: &str) -> bool {
486 let decoded = crate::html::decode_html_entities(text);
487 [
488 "Require Import",
489 "Definition",
490 "Fixpoint",
491 "Lemma",
492 "Theorem",
493 "Proof",
494 "Qed",
495 "Notation",
496 "Inductive",
497 ]
498 .iter()
499 .any(|needle| decoded.contains(needle))
500}
501
/// Prepares a LaTeX snippet for embedding in HTML: trims surrounding
/// whitespace and replaces every backslash with the numeric character
/// reference `&#92;`.
// NOTE(review): the replacement literal arrived corrupted (a bare backslash
// that left the string unterminated); `&#92;` is the reconstructed value —
// confirm against version control.
fn normalize_latex_for_html(latex: &str) -> String {
    latex.trim().replace('\\', "&#92;")
}
505
506fn extract_annotation_tex(html: &str) -> Option<String> {
507 let re = regex::Regex::new(
508 r#"(?is)<annotation\b[^>]*encoding\s*=\s*["']application/x-tex["'][^>]*>(.*?)</annotation>"#,
509 )
510 .ok()?;
511
512 re.captures(html).and_then(|caps| {
513 let text = caps.get(1)?.as_str().trim();
514 (!text.is_empty()).then(|| crate::html::decode_html_entities(text))
515 })
516}
517
518fn extract_attr(tag: &str, attr: &str) -> Option<String> {
519 let re = regex::Regex::new(&format!(
520 r#"(?is)\b{}\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))"#,
521 regex::escape(attr)
522 ))
523 .ok()?;
524
525 re.captures(tag).and_then(|caps| {
526 let value = caps
527 .get(1)
528 .or_else(|| caps.get(2))
529 .or_else(|| caps.get(3))?
530 .as_str()
531 .trim();
532 (!value.is_empty()).then(|| crate::html::decode_html_entities(value))
533 })
534}
535
536pub use browser::BrowserEngine;
538pub use search::{
539 search, SearchDiagnostics, SearchResult, SearchResultItem, DEFAULT_LIMIT, DEFAULT_PROVIDER,
540 SEARCH_PROVIDERS,
541};