Skip to main content

web_capture/
lib.rs

1//! # web-capture
2//!
3//! A library and CLI/microservice to render web pages as HTML, Markdown, or PNG screenshots.
4//!
5//! ## Features
6//!
7//! - Fetch HTML content from URLs
8//! - Convert HTML to Markdown
9//! - Capture PNG screenshots of web pages
10//! - Convert relative URLs to absolute URLs
11//! - Support for headless browser rendering via browser-commander
12//!
13//! ## Example
14//!
15//! ```rust,no_run
16//! use web_capture::{fetch_html, convert_html_to_markdown, capture_screenshot};
17//!
18//! #[tokio::main]
19//! async fn main() -> anyhow::Result<()> {
20//!     // Fetch HTML from a URL
21//!     let html = fetch_html("https://example.com").await?;
22//!     println!("HTML length: {}", html.len());
23//!
24//!     // Convert HTML to Markdown
25//!     let markdown = convert_html_to_markdown(&html, Some("https://example.com"))?;
26//!     println!("Markdown: {}", markdown);
27//!
28//!     // Capture a screenshot
29//!     let screenshot = capture_screenshot("https://example.com").await?;
30//!     println!("Screenshot size: {} bytes", screenshot.len());
31//!
32//!     Ok(())
33//! }
34//! ```
35
36pub mod animation;
37pub mod archive;
38pub mod batch;
39pub mod browser;
40pub mod extract_images;
41pub mod figures;
42pub mod gdocs;
43pub mod github;
44pub mod html;
45pub mod kreuzberg;
46pub mod latex;
47pub mod localize_images;
48pub mod markdown;
49pub mod metadata;
50pub mod postprocess;
51pub mod search;
52pub mod stackoverflow;
53pub mod themed_image;
54pub mod verify;
55pub mod xpaste;
56
57use thiserror::Error;
58
59/// Version of the web-capture library
60pub const VERSION: &str = env!("CARGO_PKG_VERSION");
61
62/// Error types for web-capture operations
63#[derive(Error, Debug)]
64pub enum WebCaptureError {
65    #[error("Failed to fetch URL: {0}")]
66    FetchError(String),
67
68    #[error("Failed to parse HTML: {0}")]
69    ParseError(String),
70
71    #[error("Failed to convert to Markdown: {0}")]
72    MarkdownError(String),
73
74    #[error("Failed to capture screenshot: {0}")]
75    ScreenshotError(String),
76
77    #[error("Browser error: {0}")]
78    BrowserError(String),
79
80    #[error("Invalid URL: {0}")]
81    InvalidUrl(String),
82
83    #[error("IO error: {0}")]
84    IoError(#[from] std::io::Error),
85
86    #[error("Request error: {0}")]
87    RequestError(#[from] reqwest::Error),
88}
89
90/// Result type for web-capture operations
91pub type Result<T> = std::result::Result<T, WebCaptureError>;
92
93/// Fetch HTML content from a URL
94///
95/// This function makes a simple HTTP GET request to fetch the HTML content.
96/// For JavaScript-heavy pages, use `render_html` instead.
97///
98/// # Arguments
99///
100/// * `url` - The URL to fetch
101///
102/// # Returns
103///
104/// The HTML content as a string
105///
106/// # Errors
107///
108/// Returns an error if the fetch fails or the response cannot be decoded
109pub async fn fetch_html(url: &str) -> Result<String> {
110    html::fetch_html(url).await
111}
112
113/// Render HTML content from a URL using a headless browser
114///
115/// This function uses browser-commander to launch a headless browser,
116/// navigate to the URL, and return the rendered HTML content.
117///
118/// # Arguments
119///
120/// * `url` - The URL to render
121///
122/// # Returns
123///
124/// The rendered HTML content as a string
125///
126/// # Errors
127///
128/// Returns an error if browser operations fail
129pub async fn render_html(url: &str) -> Result<String> {
130    browser::render_html(url).await
131}
132
133/// Convert HTML content to Markdown
134///
135/// # Arguments
136///
137/// * `html` - The HTML content to convert
138/// * `base_url` - Optional base URL for converting relative URLs to absolute
139///
140/// # Returns
141///
142/// The Markdown content as a string
143///
144/// # Errors
145///
146/// Returns an error if conversion fails
147pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
148    markdown::convert_html_to_markdown(html, base_url)
149}
150
151/// Capture a PNG screenshot of a URL
152///
153/// This function uses browser-commander to launch a headless browser,
154/// navigate to the URL, and capture a screenshot.
155///
156/// # Arguments
157///
158/// * `url` - The URL to capture
159///
160/// # Returns
161///
162/// The PNG image data as bytes
163///
164/// # Errors
165///
166/// Returns an error if browser operations fail
167pub async fn capture_screenshot(url: &str) -> Result<Vec<u8>> {
168    browser::capture_screenshot(url).await
169}
170
171/// Convert relative URLs to absolute URLs in HTML content
172///
173/// # Arguments
174///
175/// * `html` - The HTML content to process
176/// * `base_url` - The base URL to use for resolving relative URLs
177///
178/// # Returns
179///
180/// The HTML content with absolute URLs
181#[must_use]
182pub fn convert_relative_urls(html: &str, base_url: &str) -> String {
183    html::convert_relative_urls(html, base_url)
184}
185
186/// Convert HTML content to UTF-8 encoding
187///
188/// Detects the current encoding from meta tags and converts to UTF-8 if needed.
189///
190/// # Arguments
191///
192/// * `html` - The HTML content to convert
193///
194/// # Returns
195///
196/// The UTF-8 encoded HTML content
197#[must_use]
198pub fn convert_to_utf8(html: &str) -> String {
199    html::convert_to_utf8(html)
200}
201
/// Options for enhanced HTML-to-Markdown conversion.
///
/// [`Default`] enables every boolean stage and sets no selectors. Use struct
/// update syntax to override individual fields:
/// `EnhancedOptions { post_process: false, ..Default::default() }`.
// Each flag toggles one independent stage of the conversion pipeline, so
// separate bools are clearer here than a state enum.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EnhancedOptions {
    /// Extract LaTeX formulas from img.formula, `KaTeX`, `MathJax` elements.
    pub extract_latex: bool,
    /// Extract article metadata (author, date, hubs, tags).
    pub extract_metadata: bool,
    /// Apply post-processing (unicode normalization, LaTeX spacing, etc.).
    pub post_process: bool,
    /// Detect and correct code block languages.
    pub detect_code_language: bool,
    /// CSS selector used to scope Markdown conversion.
    pub content_selector: Option<String>,
    /// CSS selector for article body Markdown; prepends the selected article title when available.
    pub body_selector: Option<String>,
}
219
220impl Default for EnhancedOptions {
221    fn default() -> Self {
222        Self {
223            extract_latex: true,
224            extract_metadata: true,
225            post_process: true,
226            detect_code_language: true,
227            content_selector: None,
228            body_selector: None,
229        }
230    }
231}
232
233/// Result of enhanced HTML-to-Markdown conversion.
234#[derive(Debug, Clone)]
235pub struct EnhancedMarkdownResult {
236    pub markdown: String,
237    pub metadata: Option<metadata::ArticleMetadata>,
238}
239
240/// Convert HTML to Markdown with enhanced options.
241///
242/// Supports LaTeX formula extraction, metadata extraction, and
243/// post-processing pipeline matching the JavaScript implementation.
244///
245/// # Arguments
246///
247/// * `html` - The HTML content to convert
248/// * `base_url` - Optional base URL for resolving relative URLs
249/// * `options` - Enhanced conversion options
250///
251/// # Returns
252///
253/// Enhanced result with markdown text and optional metadata
254///
255/// # Errors
256///
257/// Returns an error if base conversion fails
258pub fn convert_html_to_markdown_enhanced(
259    html: &str,
260    base_url: Option<&str>,
261    options: &EnhancedOptions,
262) -> Result<EnhancedMarkdownResult> {
263    let mut html_for_markdown = scope_html_with_selectors(html, options);
264
265    if options.extract_latex {
266        html_for_markdown = replace_latex_formula_elements(&html_for_markdown);
267    }
268
269    if options.detect_code_language {
270        html_for_markdown = correct_code_languages(&html_for_markdown);
271    }
272
273    // Start with basic markdown conversion
274    let mut md = markdown::convert_html_to_markdown(&html_for_markdown, base_url)?;
275
276    // Extract metadata if requested
277    let extracted_metadata = if options.extract_metadata {
278        let meta = metadata::extract_metadata(html);
279        // Prepend metadata block
280        let header_lines = metadata::format_metadata_block(&meta);
281        if !header_lines.is_empty() {
282            let header = header_lines.join("\n");
283            // Insert after the first heading
284            if let Some(pos) = md.find("\n\n") {
285                md = format!("{}\n\n{}\n{}", &md[..pos], header, &md[pos + 2..]);
286            } else {
287                md = format!("{header}\n\n{md}");
288            }
289        }
290        // Append footer block
291        let footer_lines = metadata::format_footer_block(&meta);
292        if !footer_lines.is_empty() {
293            md.push_str("\n\n");
294            md.push_str(&footer_lines.join("\n"));
295        }
296        Some(meta)
297    } else {
298        None
299    };
300
301    // Apply post-processing if requested
302    if options.post_process {
303        md = postprocess::post_process_markdown(&md, &postprocess::PostProcessOptions::default());
304    }
305
306    if options.extract_latex {
307        md = normalize_extracted_latex_markdown(&md);
308    }
309
310    Ok(EnhancedMarkdownResult {
311        markdown: md,
312        metadata: extracted_metadata,
313    })
314}
315
316/// Convert HTML to Markdown using the kreuzberg html-to-markdown library.
317///
318/// Returns a structured result with content, metadata, tables, images, and warnings.
319/// This is a high-performance alternative to `convert_html_to_markdown` using the
320/// same Rust core that powers the kreuzberg ecosystem.
321///
322/// # Arguments
323///
324/// * `html` - The HTML content to convert
325/// * `base_url` - Optional base URL for converting relative URLs to absolute
326///
327/// # Returns
328///
329/// A `KreuzbergResult` with structured conversion output
330///
331/// # Errors
332///
333/// Returns an error if conversion fails
334pub fn convert_with_kreuzberg(
335    html: &str,
336    base_url: Option<&str>,
337) -> Result<kreuzberg::KreuzbergResult> {
338    kreuzberg::convert_with_kreuzberg(html, base_url)
339}
340
341/// Convert HTML to Markdown using kreuzberg after applying enhanced scoping options.
342///
343/// This keeps the alternate converter compatible with the same `contentSelector`
344/// and `bodySelector` controls used by the default enhanced converter.
345///
346/// # Errors
347///
348/// Returns an error if conversion fails.
349pub fn convert_with_kreuzberg_enhanced(
350    html: &str,
351    base_url: Option<&str>,
352    options: &EnhancedOptions,
353) -> Result<kreuzberg::KreuzbergResult> {
354    let scoped_html = scope_html_with_selectors(html, options);
355    kreuzberg::convert_with_kreuzberg(&scoped_html, base_url)
356}
357
358fn normalize_extracted_latex_markdown(markdown: &str) -> String {
359    let re = regex::Regex::new(r"\$([^$\n]+)\$").expect("valid regex");
360    re.replace_all(markdown, |caps: &regex::Captures<'_>| {
361        let formula = caps.get(1).map_or("", |m| m.as_str()).replace(r"\\", r"\");
362        format!("${formula}$")
363    })
364    .into_owned()
365}
366
367fn scope_html_with_selectors(html: &str, options: &EnhancedOptions) -> String {
368    if let Some(body_selector) = options.body_selector.as_deref() {
369        let body_html = markdown::select_html(html, body_selector);
370        let title_selector = options
371            .content_selector
372            .as_deref()
373            .map_or_else(|| "h1".to_string(), |selector| format!("{selector} h1, h1"));
374        let title_html = markdown::select_html(html, &title_selector);
375        return match (title_html, body_html) {
376            (Some(title), Some(body)) => format!("{title}\n{body}"),
377            (None, Some(body)) => body,
378            _ => html.to_string(),
379        };
380    }
381
382    options
383        .content_selector
384        .as_deref()
385        .and_then(|selector| markdown::select_html(html, selector))
386        .unwrap_or_else(|| html.to_string())
387}
388
389fn replace_latex_formula_elements(html: &str) -> String {
390    let mut result = html.to_string();
391
392    let img_formula_re = regex::Regex::new(r"(?is)<img\b[^>]*>").expect("valid regex");
393    result = img_formula_re
394        .replace_all(&result, |caps: &regex::Captures<'_>| {
395            let tag = caps.get(0).map_or("", |m| m.as_str());
396            if is_formula_img_tag(tag) {
397                extract_attr(tag, "source")
398                    .or_else(|| extract_attr(tag, "alt"))
399                    .map_or_else(
400                        || tag.to_string(),
401                        |latex| format!("${}$", normalize_latex_for_html(&latex)),
402                    )
403            } else {
404                tag.to_string()
405            }
406        })
407        .into_owned();
408
409    let math_attr_re = regex::Regex::new(
410        r"(?is)<(?P<tag>mjx-container|span|div)\b(?P<attrs>[^>]*)>.*?</(?P<tag_close>mjx-container|span|div)>",
411    )
412    .expect("valid regex");
413    math_attr_re
414        .replace_all(&result, |caps: &regex::Captures<'_>| {
415            let full = caps.get(0).map_or("", |m| m.as_str());
416            let attrs = caps.name("attrs").map_or("", |m| m.as_str());
417            let tag = caps
418                .name("tag")
419                .map_or("", |m| m.as_str())
420                .to_ascii_lowercase();
421            let tag_close = caps
422                .name("tag_close")
423                .map_or("", |m| m.as_str())
424                .to_ascii_lowercase();
425
426            if tag != tag_close || !is_math_attrs(&tag, attrs) {
427                return full.to_string();
428            }
429
430            extract_attr(attrs, "data-tex")
431                .or_else(|| extract_attr(attrs, "data-latex"))
432                .or_else(|| extract_annotation_tex(full))
433                .map_or_else(
434                    || full.to_string(),
435                    |latex| format!("${}$", normalize_latex_for_html(&latex)),
436                )
437        })
438        .into_owned()
439}
440
441fn correct_code_languages(html: &str) -> String {
442    let code_re = regex::Regex::new(r"(?is)<code\b(?P<attrs>[^>]*)>(?P<body>.*?)</code>")
443        .expect("valid regex");
444
445    code_re
446        .replace_all(html, |caps: &regex::Captures<'_>| {
447            let full = caps.get(0).map_or("", |m| m.as_str());
448            let attrs = caps.name("attrs").map_or("", |m| m.as_str());
449            let body = caps.name("body").map_or("", |m| m.as_str());
450
451            if !has_matlab_language(attrs) || !looks_like_coq(body) {
452                return full.to_string();
453            }
454
455            let updated_attrs = attrs
456                .replace("language-matlab", "language-coq")
457                .replace(r#"class="matlab""#, r#"class="coq""#)
458                .replace("class='matlab'", "class='coq'");
459
460            format!("<code{updated_attrs}>{body}</code>")
461        })
462        .into_owned()
463}
464
465fn is_formula_img_tag(tag: &str) -> bool {
466    extract_attr(tag, "source").is_some()
467        || extract_attr(tag, "class").is_some_and(|classes| classes.contains("formula"))
468}
469
470fn is_math_attrs(tag: &str, attrs: &str) -> bool {
471    tag == "mjx-container"
472        || extract_attr(attrs, "class").is_some_and(|classes| {
473            classes.contains("katex") || classes.contains("math") || classes.contains("MathJax")
474        })
475}
476
477fn has_matlab_language(attrs: &str) -> bool {
478    extract_attr(attrs, "class").is_some_and(|classes| {
479        classes
480            .split_whitespace()
481            .any(|class| class == "language-matlab" || class == "matlab")
482    })
483}
484
485fn looks_like_coq(text: &str) -> bool {
486    let decoded = crate::html::decode_html_entities(text);
487    [
488        "Require Import",
489        "Definition",
490        "Fixpoint",
491        "Lemma",
492        "Theorem",
493        "Proof",
494        "Qed",
495        "Notation",
496        "Inductive",
497    ]
498    .iter()
499    .any(|needle| decoded.contains(needle))
500}
501
/// Trim `latex` and entity-encode every backslash as `&#92;` so the formula
/// can be embedded in HTML text.
fn normalize_latex_for_html(latex: &str) -> String {
    let trimmed = latex.trim();
    let mut encoded = String::with_capacity(trimmed.len());
    for ch in trimmed.chars() {
        if ch == '\\' {
            encoded.push_str("&#92;");
        } else {
            encoded.push(ch);
        }
    }
    encoded
}
505
506fn extract_annotation_tex(html: &str) -> Option<String> {
507    let re = regex::Regex::new(
508        r#"(?is)<annotation\b[^>]*encoding\s*=\s*["']application/x-tex["'][^>]*>(.*?)</annotation>"#,
509    )
510    .ok()?;
511
512    re.captures(html).and_then(|caps| {
513        let text = caps.get(1)?.as_str().trim();
514        (!text.is_empty()).then(|| crate::html::decode_html_entities(text))
515    })
516}
517
518fn extract_attr(tag: &str, attr: &str) -> Option<String> {
519    let re = regex::Regex::new(&format!(
520        r#"(?is)\b{}\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))"#,
521        regex::escape(attr)
522    ))
523    .ok()?;
524
525    re.captures(tag).and_then(|caps| {
526        let value = caps
527            .get(1)
528            .or_else(|| caps.get(2))
529            .or_else(|| caps.get(3))?
530            .as_str()
531            .trim();
532        (!value.is_empty()).then(|| crate::html::decode_html_entities(value))
533    })
534}
535
536// Re-export commonly used types
537pub use browser::BrowserEngine;
538pub use search::{
539    search, SearchDiagnostics, SearchResult, SearchResultItem, DEFAULT_LIMIT, DEFAULT_PROVIDER,
540    SEARCH_PROVIDERS,
541};