// web_capture/lib.rs

//! # web-capture
//!
//! A library and CLI/microservice to render web pages as HTML, Markdown, or PNG screenshots.
//!
//! ## Features
//!
//! - Fetch HTML content from URLs
//! - Convert HTML to Markdown
//! - Capture PNG screenshots of web pages
//! - Convert relative URLs to absolute URLs
//! - Support for headless browser rendering via browser-commander
//!
//! ## Example
//!
//! ```rust,no_run
//! use web_capture::{fetch_html, convert_html_to_markdown, capture_screenshot};
//!
//! #[tokio::main]
//! async fn main() -> anyhow::Result<()> {
//!     // Fetch HTML from a URL
//!     let html = fetch_html("https://example.com").await?;
//!     println!("HTML length: {}", html.len());
//!
//!     // Convert HTML to Markdown
//!     let markdown = convert_html_to_markdown(&html, Some("https://example.com"))?;
//!     println!("Markdown: {}", markdown);
//!
//!     // Capture a screenshot
//!     let screenshot = capture_screenshot("https://example.com").await?;
//!     println!("Screenshot size: {} bytes", screenshot.len());
//!
//!     Ok(())
//! }
//! ```
36pub mod animation;
37pub mod archive;
38pub mod batch;
39pub mod browser;
40pub mod extract_images;
41pub mod figures;
42pub mod gdocs;
43pub mod html;
44pub mod kreuzberg;
45pub mod latex;
46pub mod localize_images;
47pub mod markdown;
48pub mod metadata;
49pub mod postprocess;
50pub mod search;
51pub mod themed_image;
52pub mod verify;
53
54use thiserror::Error;
55
56/// Version of the web-capture library
57pub const VERSION: &str = env!("CARGO_PKG_VERSION");
58
59/// Error types for web-capture operations
60#[derive(Error, Debug)]
61pub enum WebCaptureError {
62    #[error("Failed to fetch URL: {0}")]
63    FetchError(String),
64
65    #[error("Failed to parse HTML: {0}")]
66    ParseError(String),
67
68    #[error("Failed to convert to Markdown: {0}")]
69    MarkdownError(String),
70
71    #[error("Failed to capture screenshot: {0}")]
72    ScreenshotError(String),
73
74    #[error("Browser error: {0}")]
75    BrowserError(String),
76
77    #[error("Invalid URL: {0}")]
78    InvalidUrl(String),
79
80    #[error("IO error: {0}")]
81    IoError(#[from] std::io::Error),
82
83    #[error("Request error: {0}")]
84    RequestError(#[from] reqwest::Error),
85}
86
87/// Result type for web-capture operations
88pub type Result<T> = std::result::Result<T, WebCaptureError>;
89
90/// Fetch HTML content from a URL
91///
92/// This function makes a simple HTTP GET request to fetch the HTML content.
93/// For JavaScript-heavy pages, use `render_html` instead.
94///
95/// # Arguments
96///
97/// * `url` - The URL to fetch
98///
99/// # Returns
100///
101/// The HTML content as a string
102///
103/// # Errors
104///
105/// Returns an error if the fetch fails or the response cannot be decoded
106pub async fn fetch_html(url: &str) -> Result<String> {
107    html::fetch_html(url).await
108}
109
110/// Render HTML content from a URL using a headless browser
111///
112/// This function uses browser-commander to launch a headless browser,
113/// navigate to the URL, and return the rendered HTML content.
114///
115/// # Arguments
116///
117/// * `url` - The URL to render
118///
119/// # Returns
120///
121/// The rendered HTML content as a string
122///
123/// # Errors
124///
125/// Returns an error if browser operations fail
126pub async fn render_html(url: &str) -> Result<String> {
127    browser::render_html(url).await
128}
129
130/// Convert HTML content to Markdown
131///
132/// # Arguments
133///
134/// * `html` - The HTML content to convert
135/// * `base_url` - Optional base URL for converting relative URLs to absolute
136///
137/// # Returns
138///
139/// The Markdown content as a string
140///
141/// # Errors
142///
143/// Returns an error if conversion fails
144pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
145    markdown::convert_html_to_markdown(html, base_url)
146}
147
148/// Capture a PNG screenshot of a URL
149///
150/// This function uses browser-commander to launch a headless browser,
151/// navigate to the URL, and capture a screenshot.
152///
153/// # Arguments
154///
155/// * `url` - The URL to capture
156///
157/// # Returns
158///
159/// The PNG image data as bytes
160///
161/// # Errors
162///
163/// Returns an error if browser operations fail
164pub async fn capture_screenshot(url: &str) -> Result<Vec<u8>> {
165    browser::capture_screenshot(url).await
166}
167
168/// Convert relative URLs to absolute URLs in HTML content
169///
170/// # Arguments
171///
172/// * `html` - The HTML content to process
173/// * `base_url` - The base URL to use for resolving relative URLs
174///
175/// # Returns
176///
177/// The HTML content with absolute URLs
178#[must_use]
179pub fn convert_relative_urls(html: &str, base_url: &str) -> String {
180    html::convert_relative_urls(html, base_url)
181}
182
183/// Convert HTML content to UTF-8 encoding
184///
185/// Detects the current encoding from meta tags and converts to UTF-8 if needed.
186///
187/// # Arguments
188///
189/// * `html` - The HTML content to convert
190///
191/// # Returns
192///
193/// The UTF-8 encoded HTML content
194#[must_use]
195pub fn convert_to_utf8(html: &str) -> String {
196    html::convert_to_utf8(html)
197}
198
/// Options for enhanced HTML-to-Markdown conversion.
///
/// [`Default`] enables every boolean stage and sets no selectors.
// Each flag toggles an independent pipeline stage, so plain bools are the clearest model.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Clone)]
pub struct EnhancedOptions {
    /// Extract LaTeX formulas from img.formula, `KaTeX`, `MathJax` elements.
    pub extract_latex: bool,
    /// Extract article metadata (author, date, hubs, tags).
    pub extract_metadata: bool,
    /// Apply post-processing (unicode normalization, LaTeX spacing, etc.).
    pub post_process: bool,
    /// Detect and correct code block languages.
    pub detect_code_language: bool,
    /// CSS selector used to scope Markdown conversion.
    pub content_selector: Option<String>,
    /// CSS selector for article body Markdown; prepends the selected article title when available.
    pub body_selector: Option<String>,
}

impl Default for EnhancedOptions {
    /// All processing stages on, no scoping selectors.
    fn default() -> Self {
        Self {
            extract_latex: true,
            extract_metadata: true,
            post_process: true,
            detect_code_language: true,
            content_selector: None,
            body_selector: None,
        }
    }
}
229
230/// Result of enhanced HTML-to-Markdown conversion.
231#[derive(Debug, Clone)]
232pub struct EnhancedMarkdownResult {
233    pub markdown: String,
234    pub metadata: Option<metadata::ArticleMetadata>,
235}
236
237/// Convert HTML to Markdown with enhanced options.
238///
239/// Supports LaTeX formula extraction, metadata extraction, and
240/// post-processing pipeline matching the JavaScript implementation.
241///
242/// # Arguments
243///
244/// * `html` - The HTML content to convert
245/// * `base_url` - Optional base URL for resolving relative URLs
246/// * `options` - Enhanced conversion options
247///
248/// # Returns
249///
250/// Enhanced result with markdown text and optional metadata
251///
252/// # Errors
253///
254/// Returns an error if base conversion fails
255pub fn convert_html_to_markdown_enhanced(
256    html: &str,
257    base_url: Option<&str>,
258    options: &EnhancedOptions,
259) -> Result<EnhancedMarkdownResult> {
260    let mut html_for_markdown = scope_html_with_selectors(html, options);
261
262    if options.extract_latex {
263        html_for_markdown = replace_latex_formula_elements(&html_for_markdown);
264    }
265
266    if options.detect_code_language {
267        html_for_markdown = correct_code_languages(&html_for_markdown);
268    }
269
270    // Start with basic markdown conversion
271    let mut md = markdown::convert_html_to_markdown(&html_for_markdown, base_url)?;
272
273    // Extract metadata if requested
274    let extracted_metadata = if options.extract_metadata {
275        let meta = metadata::extract_metadata(html);
276        // Prepend metadata block
277        let header_lines = metadata::format_metadata_block(&meta);
278        if !header_lines.is_empty() {
279            let header = header_lines.join("\n");
280            // Insert after the first heading
281            if let Some(pos) = md.find("\n\n") {
282                md = format!("{}\n\n{}\n{}", &md[..pos], header, &md[pos + 2..]);
283            } else {
284                md = format!("{header}\n\n{md}");
285            }
286        }
287        // Append footer block
288        let footer_lines = metadata::format_footer_block(&meta);
289        if !footer_lines.is_empty() {
290            md.push_str("\n\n");
291            md.push_str(&footer_lines.join("\n"));
292        }
293        Some(meta)
294    } else {
295        None
296    };
297
298    // Apply post-processing if requested
299    if options.post_process {
300        md = postprocess::post_process_markdown(&md, &postprocess::PostProcessOptions::default());
301    }
302
303    if options.extract_latex {
304        md = normalize_extracted_latex_markdown(&md);
305    }
306
307    Ok(EnhancedMarkdownResult {
308        markdown: md,
309        metadata: extracted_metadata,
310    })
311}
312
313/// Convert HTML to Markdown using the kreuzberg html-to-markdown library.
314///
315/// Returns a structured result with content, metadata, tables, images, and warnings.
316/// This is a high-performance alternative to `convert_html_to_markdown` using the
317/// same Rust core that powers the kreuzberg ecosystem.
318///
319/// # Arguments
320///
321/// * `html` - The HTML content to convert
322/// * `base_url` - Optional base URL for converting relative URLs to absolute
323///
324/// # Returns
325///
326/// A `KreuzbergResult` with structured conversion output
327///
328/// # Errors
329///
330/// Returns an error if conversion fails
331pub fn convert_with_kreuzberg(
332    html: &str,
333    base_url: Option<&str>,
334) -> Result<kreuzberg::KreuzbergResult> {
335    kreuzberg::convert_with_kreuzberg(html, base_url)
336}
337
338/// Convert HTML to Markdown using kreuzberg after applying enhanced scoping options.
339///
340/// This keeps the alternate converter compatible with the same `contentSelector`
341/// and `bodySelector` controls used by the default enhanced converter.
342///
343/// # Errors
344///
345/// Returns an error if conversion fails.
346pub fn convert_with_kreuzberg_enhanced(
347    html: &str,
348    base_url: Option<&str>,
349    options: &EnhancedOptions,
350) -> Result<kreuzberg::KreuzbergResult> {
351    let scoped_html = scope_html_with_selectors(html, options);
352    kreuzberg::convert_with_kreuzberg(&scoped_html, base_url)
353}
354
355fn normalize_extracted_latex_markdown(markdown: &str) -> String {
356    let re = regex::Regex::new(r"\$([^$\n]+)\$").expect("valid regex");
357    re.replace_all(markdown, |caps: &regex::Captures<'_>| {
358        let formula = caps.get(1).map_or("", |m| m.as_str()).replace(r"\\", r"\");
359        format!("${formula}$")
360    })
361    .into_owned()
362}
363
364fn scope_html_with_selectors(html: &str, options: &EnhancedOptions) -> String {
365    if let Some(body_selector) = options.body_selector.as_deref() {
366        let body_html = markdown::select_html(html, body_selector);
367        let title_selector = options
368            .content_selector
369            .as_deref()
370            .map_or_else(|| "h1".to_string(), |selector| format!("{selector} h1, h1"));
371        let title_html = markdown::select_html(html, &title_selector);
372        return match (title_html, body_html) {
373            (Some(title), Some(body)) => format!("{title}\n{body}"),
374            (None, Some(body)) => body,
375            _ => html.to_string(),
376        };
377    }
378
379    options
380        .content_selector
381        .as_deref()
382        .and_then(|selector| markdown::select_html(html, selector))
383        .unwrap_or_else(|| html.to_string())
384}
385
386fn replace_latex_formula_elements(html: &str) -> String {
387    let mut result = html.to_string();
388
389    let img_formula_re = regex::Regex::new(r"(?is)<img\b[^>]*>").expect("valid regex");
390    result = img_formula_re
391        .replace_all(&result, |caps: &regex::Captures<'_>| {
392            let tag = caps.get(0).map_or("", |m| m.as_str());
393            if is_formula_img_tag(tag) {
394                extract_attr(tag, "source")
395                    .or_else(|| extract_attr(tag, "alt"))
396                    .map_or_else(
397                        || tag.to_string(),
398                        |latex| format!("${}$", normalize_latex_for_html(&latex)),
399                    )
400            } else {
401                tag.to_string()
402            }
403        })
404        .into_owned();
405
406    let math_attr_re = regex::Regex::new(
407        r"(?is)<(?P<tag>mjx-container|span|div)\b(?P<attrs>[^>]*)>.*?</(?P<tag_close>mjx-container|span|div)>",
408    )
409    .expect("valid regex");
410    math_attr_re
411        .replace_all(&result, |caps: &regex::Captures<'_>| {
412            let full = caps.get(0).map_or("", |m| m.as_str());
413            let attrs = caps.name("attrs").map_or("", |m| m.as_str());
414            let tag = caps
415                .name("tag")
416                .map_or("", |m| m.as_str())
417                .to_ascii_lowercase();
418            let tag_close = caps
419                .name("tag_close")
420                .map_or("", |m| m.as_str())
421                .to_ascii_lowercase();
422
423            if tag != tag_close || !is_math_attrs(&tag, attrs) {
424                return full.to_string();
425            }
426
427            extract_attr(attrs, "data-tex")
428                .or_else(|| extract_attr(attrs, "data-latex"))
429                .or_else(|| extract_annotation_tex(full))
430                .map_or_else(
431                    || full.to_string(),
432                    |latex| format!("${}$", normalize_latex_for_html(&latex)),
433                )
434        })
435        .into_owned()
436}
437
438fn correct_code_languages(html: &str) -> String {
439    let code_re = regex::Regex::new(r"(?is)<code\b(?P<attrs>[^>]*)>(?P<body>.*?)</code>")
440        .expect("valid regex");
441
442    code_re
443        .replace_all(html, |caps: &regex::Captures<'_>| {
444            let full = caps.get(0).map_or("", |m| m.as_str());
445            let attrs = caps.name("attrs").map_or("", |m| m.as_str());
446            let body = caps.name("body").map_or("", |m| m.as_str());
447
448            if !has_matlab_language(attrs) || !looks_like_coq(body) {
449                return full.to_string();
450            }
451
452            let updated_attrs = attrs
453                .replace("language-matlab", "language-coq")
454                .replace(r#"class="matlab""#, r#"class="coq""#)
455                .replace("class='matlab'", "class='coq'");
456
457            format!("<code{updated_attrs}>{body}</code>")
458        })
459        .into_owned()
460}
461
462fn is_formula_img_tag(tag: &str) -> bool {
463    extract_attr(tag, "source").is_some()
464        || extract_attr(tag, "class").is_some_and(|classes| classes.contains("formula"))
465}
466
467fn is_math_attrs(tag: &str, attrs: &str) -> bool {
468    tag == "mjx-container"
469        || extract_attr(attrs, "class").is_some_and(|classes| {
470            classes.contains("katex") || classes.contains("math") || classes.contains("MathJax")
471        })
472}
473
474fn has_matlab_language(attrs: &str) -> bool {
475    extract_attr(attrs, "class").is_some_and(|classes| {
476        classes
477            .split_whitespace()
478            .any(|class| class == "language-matlab" || class == "matlab")
479    })
480}
481
482fn looks_like_coq(text: &str) -> bool {
483    let decoded = crate::html::decode_html_entities(text);
484    [
485        "Require Import",
486        "Definition",
487        "Fixpoint",
488        "Lemma",
489        "Theorem",
490        "Proof",
491        "Qed",
492        "Notation",
493        "Inductive",
494    ]
495    .iter()
496    .any(|needle| decoded.contains(needle))
497}
498
/// Prepares LaTeX for splicing into an HTML document as text.
///
/// Trims surrounding whitespace and replaces every character that could be
/// misread as markup with a character reference: `\` becomes `&#92;`, `&`
/// becomes `&amp;`, `<` becomes `&lt;` and `>` becomes `&gt;`.
///
/// The input has typically been entity-decoded already, so a formula such as
/// `a<b` would otherwise open a bogus `<b` tag once re-inserted.
fn normalize_latex_for_html(latex: &str) -> String {
    let trimmed = latex.trim();
    let mut escaped = String::with_capacity(trimmed.len());
    for ch in trimmed.chars() {
        match ch {
            '\\' => escaped.push_str("&#92;"),
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }
    escaped
}
502
503fn extract_annotation_tex(html: &str) -> Option<String> {
504    let re = regex::Regex::new(
505        r#"(?is)<annotation\b[^>]*encoding\s*=\s*["']application/x-tex["'][^>]*>(.*?)</annotation>"#,
506    )
507    .ok()?;
508
509    re.captures(html).and_then(|caps| {
510        let text = caps.get(1)?.as_str().trim();
511        (!text.is_empty()).then(|| crate::html::decode_html_entities(text))
512    })
513}
514
515fn extract_attr(tag: &str, attr: &str) -> Option<String> {
516    let re = regex::Regex::new(&format!(
517        r#"(?is)\b{}\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))"#,
518        regex::escape(attr)
519    ))
520    .ok()?;
521
522    re.captures(tag).and_then(|caps| {
523        let value = caps
524            .get(1)
525            .or_else(|| caps.get(2))
526            .or_else(|| caps.get(3))?
527            .as_str()
528            .trim();
529        (!value.is_empty()).then(|| crate::html::decode_html_entities(value))
530    })
531}
532
533// Re-export commonly used types
534pub use browser::BrowserEngine;
535pub use search::{
536    search, SearchDiagnostics, SearchResult, SearchResultItem, DEFAULT_LIMIT, DEFAULT_PROVIDER,
537    SEARCH_PROVIDERS,
538};