Skip to main content

web_capture/
lib.rs

1//! # web-capture
2//!
3//! A library and CLI/microservice to render web pages as HTML, Markdown, or PNG screenshots.
4//!
5//! ## Features
6//!
7//! - Fetch HTML content from URLs
8//! - Convert HTML to Markdown
9//! - Capture PNG screenshots of web pages
10//! - Convert relative URLs to absolute URLs
11//! - Support for headless browser rendering via browser-commander
12//!
13//! ## Example
14//!
15//! ```rust,no_run
16//! use web_capture::{fetch_html, convert_html_to_markdown, capture_screenshot};
17//!
18//! #[tokio::main]
19//! async fn main() -> anyhow::Result<()> {
20//!     // Fetch HTML from a URL
21//!     let html = fetch_html("https://example.com").await?;
22//!     println!("HTML length: {}", html.len());
23//!
24//!     // Convert HTML to Markdown
25//!     let markdown = convert_html_to_markdown(&html, Some("https://example.com"))?;
26//!     println!("Markdown: {}", markdown);
27//!
28//!     // Capture a screenshot
29//!     let screenshot = capture_screenshot("https://example.com").await?;
30//!     println!("Screenshot size: {} bytes", screenshot.len());
31//!
32//!     Ok(())
33//! }
34//! ```
35
36pub mod animation;
37pub mod archive;
38pub mod batch;
39pub mod browser;
40pub mod extract_images;
41pub mod figures;
42pub mod gdocs;
43pub mod html;
44pub mod kreuzberg;
45pub mod latex;
46pub mod localize_images;
47pub mod markdown;
48pub mod metadata;
49pub mod postprocess;
50pub mod search;
51pub mod themed_image;
52pub mod verify;
53pub mod xpaste;
54
55use thiserror::Error;
56
/// Version of the web-capture library.
///
/// Resolved at compile time from the `CARGO_PKG_VERSION` environment variable
/// via `env!`, so it always matches the version the crate was built with.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
59
/// Error types for web-capture operations.
///
/// The `Display` text of each variant comes from its `#[error(...)]` attribute.
/// `IoError` and `RequestError` are marked `#[from]`, so `std::io::Error` and
/// `reqwest::Error` values convert into this type automatically when propagated
/// with `?`. All other variants carry a free-form message string.
#[derive(Error, Debug)]
pub enum WebCaptureError {
    /// Fetching a URL failed; the payload describes the failure.
    #[error("Failed to fetch URL: {0}")]
    FetchError(String),

    /// HTML could not be parsed; the payload describes the failure.
    #[error("Failed to parse HTML: {0}")]
    ParseError(String),

    /// Conversion to Markdown failed; the payload describes the failure.
    #[error("Failed to convert to Markdown: {0}")]
    MarkdownError(String),

    /// Capturing a screenshot failed; the payload describes the failure.
    #[error("Failed to capture screenshot: {0}")]
    ScreenshotError(String),

    /// A browser operation failed; the payload describes the failure.
    #[error("Browser error: {0}")]
    BrowserError(String),

    /// A URL was rejected as invalid; the payload describes the problem.
    #[error("Invalid URL: {0}")]
    InvalidUrl(String),

    /// Wrapped `std::io::Error` (converted automatically via `From`).
    #[error("IO error: {0}")]
    IoError(#[from] std::io::Error),

    /// Wrapped `reqwest::Error` (converted automatically via `From`).
    #[error("Request error: {0}")]
    RequestError(#[from] reqwest::Error),
}
87
/// Result type for web-capture operations.
///
/// Shorthand for `std::result::Result` with the error type fixed to
/// [`WebCaptureError`].
pub type Result<T> = std::result::Result<T, WebCaptureError>;
90
/// Fetch HTML content from a URL.
///
/// Crate-root convenience wrapper that forwards to `html::fetch_html`.
///
/// This function makes a simple HTTP GET request to fetch the HTML content.
/// For JavaScript-heavy pages, use [`render_html`] instead.
///
/// # Arguments
///
/// * `url` - The URL to fetch
///
/// # Returns
///
/// The HTML content as a string
///
/// # Errors
///
/// Returns an error if the fetch fails or the response cannot be decoded
pub async fn fetch_html(url: &str) -> Result<String> {
    html::fetch_html(url).await
}
110
/// Render HTML content from a URL using a headless browser.
///
/// Crate-root convenience wrapper that forwards to `browser::render_html`.
///
/// This function uses browser-commander to launch a headless browser,
/// navigate to the URL, and return the rendered HTML content.
///
/// # Arguments
///
/// * `url` - The URL to render
///
/// # Returns
///
/// The rendered HTML content as a string
///
/// # Errors
///
/// Returns an error if browser operations fail
pub async fn render_html(url: &str) -> Result<String> {
    browser::render_html(url).await
}
130
/// Convert HTML content to Markdown.
///
/// Crate-root convenience wrapper that forwards to
/// `markdown::convert_html_to_markdown`. For formula extraction, metadata and
/// post-processing, see [`convert_html_to_markdown_enhanced`].
///
/// # Arguments
///
/// * `html` - The HTML content to convert
/// * `base_url` - Optional base URL for converting relative URLs to absolute
///
/// # Returns
///
/// The Markdown content as a string
///
/// # Errors
///
/// Returns an error if conversion fails
pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
    markdown::convert_html_to_markdown(html, base_url)
}
148
/// Capture a PNG screenshot of a URL.
///
/// Crate-root convenience wrapper that forwards to `browser::capture_screenshot`.
///
/// This function uses browser-commander to launch a headless browser,
/// navigate to the URL, and capture a screenshot.
///
/// # Arguments
///
/// * `url` - The URL to capture
///
/// # Returns
///
/// The PNG image data as bytes
///
/// # Errors
///
/// Returns an error if browser operations fail
pub async fn capture_screenshot(url: &str) -> Result<Vec<u8>> {
    browser::capture_screenshot(url).await
}
168
/// Convert relative URLs to absolute URLs in HTML content.
///
/// Crate-root convenience wrapper that forwards to `html::convert_relative_urls`.
/// The input is not modified; the rewritten HTML is returned as a new string.
///
/// # Arguments
///
/// * `html` - The HTML content to process
/// * `base_url` - The base URL to use for resolving relative URLs
///
/// # Returns
///
/// The HTML content with absolute URLs
#[must_use]
pub fn convert_relative_urls(html: &str, base_url: &str) -> String {
    html::convert_relative_urls(html, base_url)
}
183
/// Convert HTML content to UTF-8 encoding.
///
/// Crate-root convenience wrapper that forwards to `html::convert_to_utf8`.
///
/// Detects the current encoding from meta tags and converts to UTF-8 if needed.
///
/// # Arguments
///
/// * `html` - The HTML content to convert
///
/// # Returns
///
/// The UTF-8 encoded HTML content
#[must_use]
pub fn convert_to_utf8(html: &str) -> String {
    html::convert_to_utf8(html)
}
199
/// Options for enhanced HTML-to-Markdown conversion.
///
/// Consumed by [`convert_html_to_markdown_enhanced`] (all fields) and by
/// [`convert_with_kreuzberg_enhanced`] (selector fields only). The [`Default`]
/// implementation enables every boolean step and sets no selectors.
// The lint is silenced because each flag gates its own independent pipeline
// step; they do not jointly encode a single state.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Clone)]
pub struct EnhancedOptions {
    /// Extract LaTeX formulas from img.formula, `KaTeX`, `MathJax` elements.
    pub extract_latex: bool,
    /// Extract article metadata (author, date, hubs, tags).
    pub extract_metadata: bool,
    /// Apply post-processing (unicode normalization, LaTeX spacing, etc.).
    pub post_process: bool,
    /// Detect and correct code block languages.
    pub detect_code_language: bool,
    /// CSS selector used to scope Markdown conversion.
    ///
    /// When `body_selector` is also set, this selector is only used to look up
    /// the article title (`<selector> h1`, falling back to any `h1`).
    pub content_selector: Option<String>,
    /// CSS selector for article body Markdown; prepends the selected article title when available.
    pub body_selector: Option<String>,
}
217
impl Default for EnhancedOptions {
    /// Enables every optional step (LaTeX extraction, metadata extraction,
    /// post-processing, code-language correction) and sets no selectors, so the
    /// whole document is converted.
    fn default() -> Self {
        Self {
            extract_latex: true,
            extract_metadata: true,
            post_process: true,
            detect_code_language: true,
            content_selector: None,
            body_selector: None,
        }
    }
}
230
/// Result of enhanced HTML-to-Markdown conversion.
///
/// Returned by [`convert_html_to_markdown_enhanced`].
#[derive(Debug, Clone)]
pub struct EnhancedMarkdownResult {
    /// The final Markdown text, including any metadata header/footer blocks.
    pub markdown: String,
    /// Extracted article metadata; `Some` only when
    /// `EnhancedOptions::extract_metadata` was enabled for the conversion.
    pub metadata: Option<metadata::ArticleMetadata>,
}
237
238/// Convert HTML to Markdown with enhanced options.
239///
240/// Supports LaTeX formula extraction, metadata extraction, and
241/// post-processing pipeline matching the JavaScript implementation.
242///
243/// # Arguments
244///
245/// * `html` - The HTML content to convert
246/// * `base_url` - Optional base URL for resolving relative URLs
247/// * `options` - Enhanced conversion options
248///
249/// # Returns
250///
251/// Enhanced result with markdown text and optional metadata
252///
253/// # Errors
254///
255/// Returns an error if base conversion fails
256pub fn convert_html_to_markdown_enhanced(
257    html: &str,
258    base_url: Option<&str>,
259    options: &EnhancedOptions,
260) -> Result<EnhancedMarkdownResult> {
261    let mut html_for_markdown = scope_html_with_selectors(html, options);
262
263    if options.extract_latex {
264        html_for_markdown = replace_latex_formula_elements(&html_for_markdown);
265    }
266
267    if options.detect_code_language {
268        html_for_markdown = correct_code_languages(&html_for_markdown);
269    }
270
271    // Start with basic markdown conversion
272    let mut md = markdown::convert_html_to_markdown(&html_for_markdown, base_url)?;
273
274    // Extract metadata if requested
275    let extracted_metadata = if options.extract_metadata {
276        let meta = metadata::extract_metadata(html);
277        // Prepend metadata block
278        let header_lines = metadata::format_metadata_block(&meta);
279        if !header_lines.is_empty() {
280            let header = header_lines.join("\n");
281            // Insert after the first heading
282            if let Some(pos) = md.find("\n\n") {
283                md = format!("{}\n\n{}\n{}", &md[..pos], header, &md[pos + 2..]);
284            } else {
285                md = format!("{header}\n\n{md}");
286            }
287        }
288        // Append footer block
289        let footer_lines = metadata::format_footer_block(&meta);
290        if !footer_lines.is_empty() {
291            md.push_str("\n\n");
292            md.push_str(&footer_lines.join("\n"));
293        }
294        Some(meta)
295    } else {
296        None
297    };
298
299    // Apply post-processing if requested
300    if options.post_process {
301        md = postprocess::post_process_markdown(&md, &postprocess::PostProcessOptions::default());
302    }
303
304    if options.extract_latex {
305        md = normalize_extracted_latex_markdown(&md);
306    }
307
308    Ok(EnhancedMarkdownResult {
309        markdown: md,
310        metadata: extracted_metadata,
311    })
312}
313
/// Convert HTML to Markdown using the kreuzberg html-to-markdown library.
///
/// Crate-root convenience wrapper that forwards to
/// `kreuzberg::convert_with_kreuzberg`. To apply selector scoping first, use
/// [`convert_with_kreuzberg_enhanced`].
///
/// Returns a structured result with content, metadata, tables, images, and warnings.
/// This is a high-performance alternative to `convert_html_to_markdown` using the
/// same Rust core that powers the kreuzberg ecosystem.
///
/// # Arguments
///
/// * `html` - The HTML content to convert
/// * `base_url` - Optional base URL for converting relative URLs to absolute
///
/// # Returns
///
/// A `KreuzbergResult` with structured conversion output
///
/// # Errors
///
/// Returns an error if conversion fails
pub fn convert_with_kreuzberg(
    html: &str,
    base_url: Option<&str>,
) -> Result<kreuzberg::KreuzbergResult> {
    kreuzberg::convert_with_kreuzberg(html, base_url)
}
338
339/// Convert HTML to Markdown using kreuzberg after applying enhanced scoping options.
340///
341/// This keeps the alternate converter compatible with the same `contentSelector`
342/// and `bodySelector` controls used by the default enhanced converter.
343///
344/// # Errors
345///
346/// Returns an error if conversion fails.
347pub fn convert_with_kreuzberg_enhanced(
348    html: &str,
349    base_url: Option<&str>,
350    options: &EnhancedOptions,
351) -> Result<kreuzberg::KreuzbergResult> {
352    let scoped_html = scope_html_with_selectors(html, options);
353    kreuzberg::convert_with_kreuzberg(&scoped_html, base_url)
354}
355
356fn normalize_extracted_latex_markdown(markdown: &str) -> String {
357    let re = regex::Regex::new(r"\$([^$\n]+)\$").expect("valid regex");
358    re.replace_all(markdown, |caps: &regex::Captures<'_>| {
359        let formula = caps.get(1).map_or("", |m| m.as_str()).replace(r"\\", r"\");
360        format!("${formula}$")
361    })
362    .into_owned()
363}
364
365fn scope_html_with_selectors(html: &str, options: &EnhancedOptions) -> String {
366    if let Some(body_selector) = options.body_selector.as_deref() {
367        let body_html = markdown::select_html(html, body_selector);
368        let title_selector = options
369            .content_selector
370            .as_deref()
371            .map_or_else(|| "h1".to_string(), |selector| format!("{selector} h1, h1"));
372        let title_html = markdown::select_html(html, &title_selector);
373        return match (title_html, body_html) {
374            (Some(title), Some(body)) => format!("{title}\n{body}"),
375            (None, Some(body)) => body,
376            _ => html.to_string(),
377        };
378    }
379
380    options
381        .content_selector
382        .as_deref()
383        .and_then(|selector| markdown::select_html(html, selector))
384        .unwrap_or_else(|| html.to_string())
385}
386
387fn replace_latex_formula_elements(html: &str) -> String {
388    let mut result = html.to_string();
389
390    let img_formula_re = regex::Regex::new(r"(?is)<img\b[^>]*>").expect("valid regex");
391    result = img_formula_re
392        .replace_all(&result, |caps: &regex::Captures<'_>| {
393            let tag = caps.get(0).map_or("", |m| m.as_str());
394            if is_formula_img_tag(tag) {
395                extract_attr(tag, "source")
396                    .or_else(|| extract_attr(tag, "alt"))
397                    .map_or_else(
398                        || tag.to_string(),
399                        |latex| format!("${}$", normalize_latex_for_html(&latex)),
400                    )
401            } else {
402                tag.to_string()
403            }
404        })
405        .into_owned();
406
407    let math_attr_re = regex::Regex::new(
408        r"(?is)<(?P<tag>mjx-container|span|div)\b(?P<attrs>[^>]*)>.*?</(?P<tag_close>mjx-container|span|div)>",
409    )
410    .expect("valid regex");
411    math_attr_re
412        .replace_all(&result, |caps: &regex::Captures<'_>| {
413            let full = caps.get(0).map_or("", |m| m.as_str());
414            let attrs = caps.name("attrs").map_or("", |m| m.as_str());
415            let tag = caps
416                .name("tag")
417                .map_or("", |m| m.as_str())
418                .to_ascii_lowercase();
419            let tag_close = caps
420                .name("tag_close")
421                .map_or("", |m| m.as_str())
422                .to_ascii_lowercase();
423
424            if tag != tag_close || !is_math_attrs(&tag, attrs) {
425                return full.to_string();
426            }
427
428            extract_attr(attrs, "data-tex")
429                .or_else(|| extract_attr(attrs, "data-latex"))
430                .or_else(|| extract_annotation_tex(full))
431                .map_or_else(
432                    || full.to_string(),
433                    |latex| format!("${}$", normalize_latex_for_html(&latex)),
434                )
435        })
436        .into_owned()
437}
438
439fn correct_code_languages(html: &str) -> String {
440    let code_re = regex::Regex::new(r"(?is)<code\b(?P<attrs>[^>]*)>(?P<body>.*?)</code>")
441        .expect("valid regex");
442
443    code_re
444        .replace_all(html, |caps: &regex::Captures<'_>| {
445            let full = caps.get(0).map_or("", |m| m.as_str());
446            let attrs = caps.name("attrs").map_or("", |m| m.as_str());
447            let body = caps.name("body").map_or("", |m| m.as_str());
448
449            if !has_matlab_language(attrs) || !looks_like_coq(body) {
450                return full.to_string();
451            }
452
453            let updated_attrs = attrs
454                .replace("language-matlab", "language-coq")
455                .replace(r#"class="matlab""#, r#"class="coq""#)
456                .replace("class='matlab'", "class='coq'");
457
458            format!("<code{updated_attrs}>{body}</code>")
459        })
460        .into_owned()
461}
462
463fn is_formula_img_tag(tag: &str) -> bool {
464    extract_attr(tag, "source").is_some()
465        || extract_attr(tag, "class").is_some_and(|classes| classes.contains("formula"))
466}
467
468fn is_math_attrs(tag: &str, attrs: &str) -> bool {
469    tag == "mjx-container"
470        || extract_attr(attrs, "class").is_some_and(|classes| {
471            classes.contains("katex") || classes.contains("math") || classes.contains("MathJax")
472        })
473}
474
475fn has_matlab_language(attrs: &str) -> bool {
476    extract_attr(attrs, "class").is_some_and(|classes| {
477        classes
478            .split_whitespace()
479            .any(|class| class == "language-matlab" || class == "matlab")
480    })
481}
482
483fn looks_like_coq(text: &str) -> bool {
484    let decoded = crate::html::decode_html_entities(text);
485    [
486        "Require Import",
487        "Definition",
488        "Fixpoint",
489        "Lemma",
490        "Theorem",
491        "Proof",
492        "Qed",
493        "Notation",
494        "Inductive",
495    ]
496    .iter()
497    .any(|needle| decoded.contains(needle))
498}
499
/// Prepare LaTeX source for insertion into HTML as plain text.
///
/// Surrounding whitespace is trimmed and the characters that an HTML parser
/// would otherwise interpret are replaced by character references:
///
/// * `\` becomes `&#92;` (kept as in the original implementation),
/// * `&` becomes `&amp;`, so sequences such as `&lt` inside a formula are not
///   decoded as entities,
/// * `<` becomes `&lt;` and `>` becomes `&gt;`, so a formula such as `x<y` is
///   not parsed as the start of a tag.
///
/// An HTML parser decodes these references back to the original characters, so
/// the text node seen by the Markdown converter is the trimmed LaTeX itself.
fn normalize_latex_for_html(latex: &str) -> String {
    let trimmed = latex.trim();
    let mut escaped = String::with_capacity(trimmed.len());
    for ch in trimmed.chars() {
        match ch {
            '\\' => escaped.push_str("&#92;"),
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }
    escaped
}
503
504fn extract_annotation_tex(html: &str) -> Option<String> {
505    let re = regex::Regex::new(
506        r#"(?is)<annotation\b[^>]*encoding\s*=\s*["']application/x-tex["'][^>]*>(.*?)</annotation>"#,
507    )
508    .ok()?;
509
510    re.captures(html).and_then(|caps| {
511        let text = caps.get(1)?.as_str().trim();
512        (!text.is_empty()).then(|| crate::html::decode_html_entities(text))
513    })
514}
515
516fn extract_attr(tag: &str, attr: &str) -> Option<String> {
517    let re = regex::Regex::new(&format!(
518        r#"(?is)\b{}\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))"#,
519        regex::escape(attr)
520    ))
521    .ok()?;
522
523    re.captures(tag).and_then(|caps| {
524        let value = caps
525            .get(1)
526            .or_else(|| caps.get(2))
527            .or_else(|| caps.get(3))?
528            .as_str()
529            .trim();
530        (!value.is_empty()).then(|| crate::html::decode_html_entities(value))
531    })
532}
533
534// Re-export commonly used types
535pub use browser::BrowserEngine;
536pub use search::{
537    search, SearchDiagnostics, SearchResult, SearchResultItem, DEFAULT_LIMIT, DEFAULT_PROVIDER,
538    SEARCH_PROVIDERS,
539};