Skip to main content

web_capture/
lib.rs

1//! # web-capture
2//!
3//! A library and CLI/microservice to render web pages as HTML, Markdown, or PNG screenshots.
4//!
5//! ## Features
6//!
7//! - Fetch HTML content from URLs
8//! - Convert HTML to Markdown
9//! - Capture PNG screenshots of web pages
10//! - Convert relative URLs to absolute URLs
11//! - Support for headless browser rendering via browser-commander
12//!
13//! ## Example
14//!
15//! ```rust,no_run
16//! use web_capture::{fetch_html, convert_html_to_markdown, capture_screenshot};
17//!
18//! #[tokio::main]
19//! async fn main() -> anyhow::Result<()> {
20//!     // Fetch HTML from a URL
21//!     let html = fetch_html("https://example.com").await?;
22//!     println!("HTML length: {}", html.len());
23//!
24//!     // Convert HTML to Markdown
25//!     let markdown = convert_html_to_markdown(&html, Some("https://example.com"))?;
26//!     println!("Markdown: {}", markdown);
27//!
28//!     // Capture a screenshot
29//!     let screenshot = capture_screenshot("https://example.com").await?;
30//!     println!("Screenshot size: {} bytes", screenshot.len());
31//!
32//!     Ok(())
33//! }
34//! ```
35
36pub mod animation;
37pub mod archive;
38pub mod batch;
39pub mod browser;
40pub mod extract_images;
41pub mod figures;
42pub mod gdocs;
43pub mod github;
44pub mod html;
45pub mod kreuzberg;
46pub mod latex;
47pub mod localize_images;
48pub mod markdown;
49pub mod metadata;
50pub mod postprocess;
51pub mod search;
52pub mod themed_image;
53pub mod verify;
54pub mod xpaste;
55
56use thiserror::Error;
57
58/// Version of the web-capture library
59pub const VERSION: &str = env!("CARGO_PKG_VERSION");
60
61/// Error types for web-capture operations
62#[derive(Error, Debug)]
63pub enum WebCaptureError {
64    #[error("Failed to fetch URL: {0}")]
65    FetchError(String),
66
67    #[error("Failed to parse HTML: {0}")]
68    ParseError(String),
69
70    #[error("Failed to convert to Markdown: {0}")]
71    MarkdownError(String),
72
73    #[error("Failed to capture screenshot: {0}")]
74    ScreenshotError(String),
75
76    #[error("Browser error: {0}")]
77    BrowserError(String),
78
79    #[error("Invalid URL: {0}")]
80    InvalidUrl(String),
81
82    #[error("IO error: {0}")]
83    IoError(#[from] std::io::Error),
84
85    #[error("Request error: {0}")]
86    RequestError(#[from] reqwest::Error),
87}
88
89/// Result type for web-capture operations
90pub type Result<T> = std::result::Result<T, WebCaptureError>;
91
92/// Fetch HTML content from a URL
93///
94/// This function makes a simple HTTP GET request to fetch the HTML content.
95/// For JavaScript-heavy pages, use `render_html` instead.
96///
97/// # Arguments
98///
99/// * `url` - The URL to fetch
100///
101/// # Returns
102///
103/// The HTML content as a string
104///
105/// # Errors
106///
107/// Returns an error if the fetch fails or the response cannot be decoded
108pub async fn fetch_html(url: &str) -> Result<String> {
109    html::fetch_html(url).await
110}
111
112/// Render HTML content from a URL using a headless browser
113///
114/// This function uses browser-commander to launch a headless browser,
115/// navigate to the URL, and return the rendered HTML content.
116///
117/// # Arguments
118///
119/// * `url` - The URL to render
120///
121/// # Returns
122///
123/// The rendered HTML content as a string
124///
125/// # Errors
126///
127/// Returns an error if browser operations fail
128pub async fn render_html(url: &str) -> Result<String> {
129    browser::render_html(url).await
130}
131
132/// Convert HTML content to Markdown
133///
134/// # Arguments
135///
136/// * `html` - The HTML content to convert
137/// * `base_url` - Optional base URL for converting relative URLs to absolute
138///
139/// # Returns
140///
141/// The Markdown content as a string
142///
143/// # Errors
144///
145/// Returns an error if conversion fails
146pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
147    markdown::convert_html_to_markdown(html, base_url)
148}
149
150/// Capture a PNG screenshot of a URL
151///
152/// This function uses browser-commander to launch a headless browser,
153/// navigate to the URL, and capture a screenshot.
154///
155/// # Arguments
156///
157/// * `url` - The URL to capture
158///
159/// # Returns
160///
161/// The PNG image data as bytes
162///
163/// # Errors
164///
165/// Returns an error if browser operations fail
166pub async fn capture_screenshot(url: &str) -> Result<Vec<u8>> {
167    browser::capture_screenshot(url).await
168}
169
170/// Convert relative URLs to absolute URLs in HTML content
171///
172/// # Arguments
173///
174/// * `html` - The HTML content to process
175/// * `base_url` - The base URL to use for resolving relative URLs
176///
177/// # Returns
178///
179/// The HTML content with absolute URLs
180#[must_use]
181pub fn convert_relative_urls(html: &str, base_url: &str) -> String {
182    html::convert_relative_urls(html, base_url)
183}
184
185/// Convert HTML content to UTF-8 encoding
186///
187/// Detects the current encoding from meta tags and converts to UTF-8 if needed.
188///
189/// # Arguments
190///
191/// * `html` - The HTML content to convert
192///
193/// # Returns
194///
195/// The UTF-8 encoded HTML content
196#[must_use]
197pub fn convert_to_utf8(html: &str) -> String {
198    html::convert_to_utf8(html)
199}
200
/// Switches and selectors controlling enhanced HTML-to-Markdown conversion.
///
/// [`Default`] turns every boolean feature on and sets no selectors.
// Each flag is an independent feature toggle, so the bool-heavy layout is intentional.
#[allow(clippy::struct_excessive_bools)]
#[derive(Clone, Debug)]
pub struct EnhancedOptions {
    /// Extract LaTeX formulas from img.formula, `KaTeX`, `MathJax` elements.
    pub extract_latex: bool,
    /// Extract article metadata (author, date, hubs, tags).
    pub extract_metadata: bool,
    /// Apply post-processing (unicode normalization, LaTeX spacing, etc.).
    pub post_process: bool,
    /// Detect and correct code block languages.
    pub detect_code_language: bool,
    /// CSS selector used to scope Markdown conversion.
    pub content_selector: Option<String>,
    /// CSS selector for article body Markdown; prepends the selected article title when available.
    pub body_selector: Option<String>,
}

impl Default for EnhancedOptions {
    /// All feature flags enabled, no scoping selectors.
    fn default() -> Self {
        Self {
            // No scoping: the whole document is converted.
            content_selector: None,
            body_selector: None,
            // Every enhancement is on unless the caller opts out.
            extract_latex: true,
            extract_metadata: true,
            post_process: true,
            detect_code_language: true,
        }
    }
}
231
232/// Result of enhanced HTML-to-Markdown conversion.
233#[derive(Debug, Clone)]
234pub struct EnhancedMarkdownResult {
235    pub markdown: String,
236    pub metadata: Option<metadata::ArticleMetadata>,
237}
238
239/// Convert HTML to Markdown with enhanced options.
240///
241/// Supports LaTeX formula extraction, metadata extraction, and
242/// post-processing pipeline matching the JavaScript implementation.
243///
244/// # Arguments
245///
246/// * `html` - The HTML content to convert
247/// * `base_url` - Optional base URL for resolving relative URLs
248/// * `options` - Enhanced conversion options
249///
250/// # Returns
251///
252/// Enhanced result with markdown text and optional metadata
253///
254/// # Errors
255///
256/// Returns an error if base conversion fails
257pub fn convert_html_to_markdown_enhanced(
258    html: &str,
259    base_url: Option<&str>,
260    options: &EnhancedOptions,
261) -> Result<EnhancedMarkdownResult> {
262    let mut html_for_markdown = scope_html_with_selectors(html, options);
263
264    if options.extract_latex {
265        html_for_markdown = replace_latex_formula_elements(&html_for_markdown);
266    }
267
268    if options.detect_code_language {
269        html_for_markdown = correct_code_languages(&html_for_markdown);
270    }
271
272    // Start with basic markdown conversion
273    let mut md = markdown::convert_html_to_markdown(&html_for_markdown, base_url)?;
274
275    // Extract metadata if requested
276    let extracted_metadata = if options.extract_metadata {
277        let meta = metadata::extract_metadata(html);
278        // Prepend metadata block
279        let header_lines = metadata::format_metadata_block(&meta);
280        if !header_lines.is_empty() {
281            let header = header_lines.join("\n");
282            // Insert after the first heading
283            if let Some(pos) = md.find("\n\n") {
284                md = format!("{}\n\n{}\n{}", &md[..pos], header, &md[pos + 2..]);
285            } else {
286                md = format!("{header}\n\n{md}");
287            }
288        }
289        // Append footer block
290        let footer_lines = metadata::format_footer_block(&meta);
291        if !footer_lines.is_empty() {
292            md.push_str("\n\n");
293            md.push_str(&footer_lines.join("\n"));
294        }
295        Some(meta)
296    } else {
297        None
298    };
299
300    // Apply post-processing if requested
301    if options.post_process {
302        md = postprocess::post_process_markdown(&md, &postprocess::PostProcessOptions::default());
303    }
304
305    if options.extract_latex {
306        md = normalize_extracted_latex_markdown(&md);
307    }
308
309    Ok(EnhancedMarkdownResult {
310        markdown: md,
311        metadata: extracted_metadata,
312    })
313}
314
315/// Convert HTML to Markdown using the kreuzberg html-to-markdown library.
316///
317/// Returns a structured result with content, metadata, tables, images, and warnings.
318/// This is a high-performance alternative to `convert_html_to_markdown` using the
319/// same Rust core that powers the kreuzberg ecosystem.
320///
321/// # Arguments
322///
323/// * `html` - The HTML content to convert
324/// * `base_url` - Optional base URL for converting relative URLs to absolute
325///
326/// # Returns
327///
328/// A `KreuzbergResult` with structured conversion output
329///
330/// # Errors
331///
332/// Returns an error if conversion fails
333pub fn convert_with_kreuzberg(
334    html: &str,
335    base_url: Option<&str>,
336) -> Result<kreuzberg::KreuzbergResult> {
337    kreuzberg::convert_with_kreuzberg(html, base_url)
338}
339
340/// Convert HTML to Markdown using kreuzberg after applying enhanced scoping options.
341///
342/// This keeps the alternate converter compatible with the same `contentSelector`
343/// and `bodySelector` controls used by the default enhanced converter.
344///
345/// # Errors
346///
347/// Returns an error if conversion fails.
348pub fn convert_with_kreuzberg_enhanced(
349    html: &str,
350    base_url: Option<&str>,
351    options: &EnhancedOptions,
352) -> Result<kreuzberg::KreuzbergResult> {
353    let scoped_html = scope_html_with_selectors(html, options);
354    kreuzberg::convert_with_kreuzberg(&scoped_html, base_url)
355}
356
357fn normalize_extracted_latex_markdown(markdown: &str) -> String {
358    let re = regex::Regex::new(r"\$([^$\n]+)\$").expect("valid regex");
359    re.replace_all(markdown, |caps: &regex::Captures<'_>| {
360        let formula = caps.get(1).map_or("", |m| m.as_str()).replace(r"\\", r"\");
361        format!("${formula}$")
362    })
363    .into_owned()
364}
365
366fn scope_html_with_selectors(html: &str, options: &EnhancedOptions) -> String {
367    if let Some(body_selector) = options.body_selector.as_deref() {
368        let body_html = markdown::select_html(html, body_selector);
369        let title_selector = options
370            .content_selector
371            .as_deref()
372            .map_or_else(|| "h1".to_string(), |selector| format!("{selector} h1, h1"));
373        let title_html = markdown::select_html(html, &title_selector);
374        return match (title_html, body_html) {
375            (Some(title), Some(body)) => format!("{title}\n{body}"),
376            (None, Some(body)) => body,
377            _ => html.to_string(),
378        };
379    }
380
381    options
382        .content_selector
383        .as_deref()
384        .and_then(|selector| markdown::select_html(html, selector))
385        .unwrap_or_else(|| html.to_string())
386}
387
388fn replace_latex_formula_elements(html: &str) -> String {
389    let mut result = html.to_string();
390
391    let img_formula_re = regex::Regex::new(r"(?is)<img\b[^>]*>").expect("valid regex");
392    result = img_formula_re
393        .replace_all(&result, |caps: &regex::Captures<'_>| {
394            let tag = caps.get(0).map_or("", |m| m.as_str());
395            if is_formula_img_tag(tag) {
396                extract_attr(tag, "source")
397                    .or_else(|| extract_attr(tag, "alt"))
398                    .map_or_else(
399                        || tag.to_string(),
400                        |latex| format!("${}$", normalize_latex_for_html(&latex)),
401                    )
402            } else {
403                tag.to_string()
404            }
405        })
406        .into_owned();
407
408    let math_attr_re = regex::Regex::new(
409        r"(?is)<(?P<tag>mjx-container|span|div)\b(?P<attrs>[^>]*)>.*?</(?P<tag_close>mjx-container|span|div)>",
410    )
411    .expect("valid regex");
412    math_attr_re
413        .replace_all(&result, |caps: &regex::Captures<'_>| {
414            let full = caps.get(0).map_or("", |m| m.as_str());
415            let attrs = caps.name("attrs").map_or("", |m| m.as_str());
416            let tag = caps
417                .name("tag")
418                .map_or("", |m| m.as_str())
419                .to_ascii_lowercase();
420            let tag_close = caps
421                .name("tag_close")
422                .map_or("", |m| m.as_str())
423                .to_ascii_lowercase();
424
425            if tag != tag_close || !is_math_attrs(&tag, attrs) {
426                return full.to_string();
427            }
428
429            extract_attr(attrs, "data-tex")
430                .or_else(|| extract_attr(attrs, "data-latex"))
431                .or_else(|| extract_annotation_tex(full))
432                .map_or_else(
433                    || full.to_string(),
434                    |latex| format!("${}$", normalize_latex_for_html(&latex)),
435                )
436        })
437        .into_owned()
438}
439
440fn correct_code_languages(html: &str) -> String {
441    let code_re = regex::Regex::new(r"(?is)<code\b(?P<attrs>[^>]*)>(?P<body>.*?)</code>")
442        .expect("valid regex");
443
444    code_re
445        .replace_all(html, |caps: &regex::Captures<'_>| {
446            let full = caps.get(0).map_or("", |m| m.as_str());
447            let attrs = caps.name("attrs").map_or("", |m| m.as_str());
448            let body = caps.name("body").map_or("", |m| m.as_str());
449
450            if !has_matlab_language(attrs) || !looks_like_coq(body) {
451                return full.to_string();
452            }
453
454            let updated_attrs = attrs
455                .replace("language-matlab", "language-coq")
456                .replace(r#"class="matlab""#, r#"class="coq""#)
457                .replace("class='matlab'", "class='coq'");
458
459            format!("<code{updated_attrs}>{body}</code>")
460        })
461        .into_owned()
462}
463
464fn is_formula_img_tag(tag: &str) -> bool {
465    extract_attr(tag, "source").is_some()
466        || extract_attr(tag, "class").is_some_and(|classes| classes.contains("formula"))
467}
468
469fn is_math_attrs(tag: &str, attrs: &str) -> bool {
470    tag == "mjx-container"
471        || extract_attr(attrs, "class").is_some_and(|classes| {
472            classes.contains("katex") || classes.contains("math") || classes.contains("MathJax")
473        })
474}
475
476fn has_matlab_language(attrs: &str) -> bool {
477    extract_attr(attrs, "class").is_some_and(|classes| {
478        classes
479            .split_whitespace()
480            .any(|class| class == "language-matlab" || class == "matlab")
481    })
482}
483
484fn looks_like_coq(text: &str) -> bool {
485    let decoded = crate::html::decode_html_entities(text);
486    [
487        "Require Import",
488        "Definition",
489        "Fixpoint",
490        "Lemma",
491        "Theorem",
492        "Proof",
493        "Qed",
494        "Notation",
495        "Inductive",
496    ]
497    .iter()
498    .any(|needle| decoded.contains(needle))
499}
500
/// Trims `latex` and replaces every backslash with the HTML entity `&#92;`.
///
/// NOTE(review): presumably this keeps the backslashes from being treated as
/// escapes by the later HTML-to-Markdown conversion; confirm against the converter.
fn normalize_latex_for_html(latex: &str) -> String {
    let trimmed = latex.trim();
    let mut escaped = String::with_capacity(trimmed.len());
    for ch in trimmed.chars() {
        if ch == '\\' {
            escaped.push_str("&#92;");
        } else {
            escaped.push(ch);
        }
    }
    escaped
}
504
505fn extract_annotation_tex(html: &str) -> Option<String> {
506    let re = regex::Regex::new(
507        r#"(?is)<annotation\b[^>]*encoding\s*=\s*["']application/x-tex["'][^>]*>(.*?)</annotation>"#,
508    )
509    .ok()?;
510
511    re.captures(html).and_then(|caps| {
512        let text = caps.get(1)?.as_str().trim();
513        (!text.is_empty()).then(|| crate::html::decode_html_entities(text))
514    })
515}
516
517fn extract_attr(tag: &str, attr: &str) -> Option<String> {
518    let re = regex::Regex::new(&format!(
519        r#"(?is)\b{}\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))"#,
520        regex::escape(attr)
521    ))
522    .ok()?;
523
524    re.captures(tag).and_then(|caps| {
525        let value = caps
526            .get(1)
527            .or_else(|| caps.get(2))
528            .or_else(|| caps.get(3))?
529            .as_str()
530            .trim();
531        (!value.is_empty()).then(|| crate::html::decode_html_entities(value))
532    })
533}
534
535// Re-export commonly used types
536pub use browser::BrowserEngine;
537pub use search::{
538    search, SearchDiagnostics, SearchResult, SearchResultItem, DEFAULT_LIMIT, DEFAULT_PROVIDER,
539    SEARCH_PROVIDERS,
540};