Skip to main content

web_capture/
lib.rs

1//! # web-capture
2//!
3//! A library and CLI/microservice to render web pages as HTML, Markdown, or PNG screenshots.
4//!
5//! ## Features
6//!
7//! - Fetch HTML content from URLs
8//! - Convert HTML to Markdown
9//! - Capture PNG screenshots of web pages
10//! - Convert relative URLs to absolute URLs
11//! - Support for headless browser rendering via browser-commander
12//!
13//! ## Example
14//!
15//! ```rust,no_run
16//! use web_capture::{fetch_html, convert_html_to_markdown, capture_screenshot};
17//!
18//! #[tokio::main]
19//! async fn main() -> anyhow::Result<()> {
20//!     // Fetch HTML from a URL
21//!     let html = fetch_html("https://example.com").await?;
22//!     println!("HTML length: {}", html.len());
23//!
24//!     // Convert HTML to Markdown
25//!     let markdown = convert_html_to_markdown(&html, Some("https://example.com"))?;
26//!     println!("Markdown: {}", markdown);
27//!
28//!     // Capture a screenshot
29//!     let screenshot = capture_screenshot("https://example.com").await?;
30//!     println!("Screenshot size: {} bytes", screenshot.len());
31//!
32//!     Ok(())
33//! }
34//! ```
35
36pub mod animation;
37pub mod archive;
38pub mod batch;
39pub mod browser;
40pub mod extract_images;
41pub mod figures;
42pub mod gdocs;
43pub mod html;
44pub mod latex;
45pub mod localize_images;
46pub mod markdown;
47pub mod metadata;
48pub mod postprocess;
49pub mod search;
50pub mod themed_image;
51pub mod verify;
52
53use thiserror::Error;
54
55/// Version of the web-capture library
56pub const VERSION: &str = env!("CARGO_PKG_VERSION");
57
58/// Error types for web-capture operations
59#[derive(Error, Debug)]
60pub enum WebCaptureError {
61    #[error("Failed to fetch URL: {0}")]
62    FetchError(String),
63
64    #[error("Failed to parse HTML: {0}")]
65    ParseError(String),
66
67    #[error("Failed to convert to Markdown: {0}")]
68    MarkdownError(String),
69
70    #[error("Failed to capture screenshot: {0}")]
71    ScreenshotError(String),
72
73    #[error("Browser error: {0}")]
74    BrowserError(String),
75
76    #[error("Invalid URL: {0}")]
77    InvalidUrl(String),
78
79    #[error("IO error: {0}")]
80    IoError(#[from] std::io::Error),
81
82    #[error("Request error: {0}")]
83    RequestError(#[from] reqwest::Error),
84}
85
86/// Result type for web-capture operations
87pub type Result<T> = std::result::Result<T, WebCaptureError>;
88
89/// Fetch HTML content from a URL
90///
91/// This function makes a simple HTTP GET request to fetch the HTML content.
92/// For JavaScript-heavy pages, use `render_html` instead.
93///
94/// # Arguments
95///
96/// * `url` - The URL to fetch
97///
98/// # Returns
99///
100/// The HTML content as a string
101///
102/// # Errors
103///
104/// Returns an error if the fetch fails or the response cannot be decoded
105pub async fn fetch_html(url: &str) -> Result<String> {
106    html::fetch_html(url).await
107}
108
109/// Render HTML content from a URL using a headless browser
110///
111/// This function uses browser-commander to launch a headless browser,
112/// navigate to the URL, and return the rendered HTML content.
113///
114/// # Arguments
115///
116/// * `url` - The URL to render
117///
118/// # Returns
119///
120/// The rendered HTML content as a string
121///
122/// # Errors
123///
124/// Returns an error if browser operations fail
125pub async fn render_html(url: &str) -> Result<String> {
126    browser::render_html(url).await
127}
128
129/// Convert HTML content to Markdown
130///
131/// # Arguments
132///
133/// * `html` - The HTML content to convert
134/// * `base_url` - Optional base URL for converting relative URLs to absolute
135///
136/// # Returns
137///
138/// The Markdown content as a string
139///
140/// # Errors
141///
142/// Returns an error if conversion fails
143pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
144    markdown::convert_html_to_markdown(html, base_url)
145}
146
147/// Capture a PNG screenshot of a URL
148///
149/// This function uses browser-commander to launch a headless browser,
150/// navigate to the URL, and capture a screenshot.
151///
152/// # Arguments
153///
154/// * `url` - The URL to capture
155///
156/// # Returns
157///
158/// The PNG image data as bytes
159///
160/// # Errors
161///
162/// Returns an error if browser operations fail
163pub async fn capture_screenshot(url: &str) -> Result<Vec<u8>> {
164    browser::capture_screenshot(url).await
165}
166
167/// Convert relative URLs to absolute URLs in HTML content
168///
169/// # Arguments
170///
171/// * `html` - The HTML content to process
172/// * `base_url` - The base URL to use for resolving relative URLs
173///
174/// # Returns
175///
176/// The HTML content with absolute URLs
177#[must_use]
178pub fn convert_relative_urls(html: &str, base_url: &str) -> String {
179    html::convert_relative_urls(html, base_url)
180}
181
182/// Convert HTML content to UTF-8 encoding
183///
184/// Detects the current encoding from meta tags and converts to UTF-8 if needed.
185///
186/// # Arguments
187///
188/// * `html` - The HTML content to convert
189///
190/// # Returns
191///
192/// The UTF-8 encoded HTML content
193#[must_use]
194pub fn convert_to_utf8(html: &str) -> String {
195    html::convert_to_utf8(html)
196}
197
/// Switches and selectors controlling [`convert_html_to_markdown_enhanced`].
///
/// [`Default`] turns every boolean stage on and leaves both selectors unset.
// The four flags are independent toggles, so a bool per stage is intentional.
#[allow(clippy::struct_excessive_bools)]
#[derive(Clone, Debug)]
pub struct EnhancedOptions {
    /// Replace formula elements (`img.formula`, `KaTeX`, `MathJax`) with
    /// inline `$...$` LaTeX before conversion.
    pub extract_latex: bool,
    /// Collect article metadata (author, date, hubs, tags) and add it to the
    /// Markdown output.
    pub extract_metadata: bool,
    /// Run the Markdown post-processing pass (unicode normalization, LaTeX
    /// spacing, etc.).
    pub post_process: bool,
    /// Detect mislabelled code blocks and correct their language.
    pub detect_code_language: bool,
    /// CSS selector that limits which part of the page is converted.
    pub content_selector: Option<String>,
    /// CSS selector for the article body; when set, the selected article
    /// title (if one is found) is placed in front of the body.
    pub body_selector: Option<String>,
}

impl Default for EnhancedOptions {
    /// All processing stages enabled, no selector scoping.
    fn default() -> Self {
        Self {
            content_selector: None,
            body_selector: None,
            extract_latex: true,
            extract_metadata: true,
            post_process: true,
            detect_code_language: true,
        }
    }
}
228
229/// Result of enhanced HTML-to-Markdown conversion.
230#[derive(Debug, Clone)]
231pub struct EnhancedMarkdownResult {
232    pub markdown: String,
233    pub metadata: Option<metadata::ArticleMetadata>,
234}
235
236/// Convert HTML to Markdown with enhanced options.
237///
238/// Supports LaTeX formula extraction, metadata extraction, and
239/// post-processing pipeline matching the JavaScript implementation.
240///
241/// # Arguments
242///
243/// * `html` - The HTML content to convert
244/// * `base_url` - Optional base URL for resolving relative URLs
245/// * `options` - Enhanced conversion options
246///
247/// # Returns
248///
249/// Enhanced result with markdown text and optional metadata
250///
251/// # Errors
252///
253/// Returns an error if base conversion fails
254pub fn convert_html_to_markdown_enhanced(
255    html: &str,
256    base_url: Option<&str>,
257    options: &EnhancedOptions,
258) -> Result<EnhancedMarkdownResult> {
259    let mut html_for_markdown = scope_html_with_selectors(html, options);
260
261    if options.extract_latex {
262        html_for_markdown = replace_latex_formula_elements(&html_for_markdown);
263    }
264
265    if options.detect_code_language {
266        html_for_markdown = correct_code_languages(&html_for_markdown);
267    }
268
269    // Start with basic markdown conversion
270    let mut md = markdown::convert_html_to_markdown(&html_for_markdown, base_url)?;
271
272    // Extract metadata if requested
273    let extracted_metadata = if options.extract_metadata {
274        let meta = metadata::extract_metadata(html);
275        // Prepend metadata block
276        let header_lines = metadata::format_metadata_block(&meta);
277        if !header_lines.is_empty() {
278            let header = header_lines.join("\n");
279            // Insert after the first heading
280            if let Some(pos) = md.find("\n\n") {
281                md = format!("{}\n\n{}\n{}", &md[..pos], header, &md[pos + 2..]);
282            } else {
283                md = format!("{header}\n\n{md}");
284            }
285        }
286        // Append footer block
287        let footer_lines = metadata::format_footer_block(&meta);
288        if !footer_lines.is_empty() {
289            md.push_str("\n\n");
290            md.push_str(&footer_lines.join("\n"));
291        }
292        Some(meta)
293    } else {
294        None
295    };
296
297    // Apply post-processing if requested
298    if options.post_process {
299        md = postprocess::post_process_markdown(&md, &postprocess::PostProcessOptions::default());
300    }
301
302    if options.extract_latex {
303        md = normalize_extracted_latex_markdown(&md);
304    }
305
306    Ok(EnhancedMarkdownResult {
307        markdown: md,
308        metadata: extracted_metadata,
309    })
310}
311
312fn normalize_extracted_latex_markdown(markdown: &str) -> String {
313    let re = regex::Regex::new(r"\$([^$\n]+)\$").expect("valid regex");
314    re.replace_all(markdown, |caps: &regex::Captures<'_>| {
315        let formula = caps.get(1).map_or("", |m| m.as_str()).replace(r"\\", r"\");
316        format!("${formula}$")
317    })
318    .into_owned()
319}
320
321fn scope_html_with_selectors(html: &str, options: &EnhancedOptions) -> String {
322    if let Some(body_selector) = options.body_selector.as_deref() {
323        let body_html = markdown::select_html(html, body_selector);
324        let title_selector = options
325            .content_selector
326            .as_deref()
327            .map_or_else(|| "h1".to_string(), |selector| format!("{selector} h1, h1"));
328        let title_html = markdown::select_html(html, &title_selector);
329        return match (title_html, body_html) {
330            (Some(title), Some(body)) => format!("{title}\n{body}"),
331            (None, Some(body)) => body,
332            _ => html.to_string(),
333        };
334    }
335
336    options
337        .content_selector
338        .as_deref()
339        .and_then(|selector| markdown::select_html(html, selector))
340        .unwrap_or_else(|| html.to_string())
341}
342
343fn replace_latex_formula_elements(html: &str) -> String {
344    let mut result = html.to_string();
345
346    let img_formula_re = regex::Regex::new(r"(?is)<img\b[^>]*>").expect("valid regex");
347    result = img_formula_re
348        .replace_all(&result, |caps: &regex::Captures<'_>| {
349            let tag = caps.get(0).map_or("", |m| m.as_str());
350            if is_formula_img_tag(tag) {
351                extract_attr(tag, "source")
352                    .or_else(|| extract_attr(tag, "alt"))
353                    .map_or_else(
354                        || tag.to_string(),
355                        |latex| format!("${}$", normalize_latex_for_html(&latex)),
356                    )
357            } else {
358                tag.to_string()
359            }
360        })
361        .into_owned();
362
363    let math_attr_re = regex::Regex::new(
364        r"(?is)<(?P<tag>mjx-container|span|div)\b(?P<attrs>[^>]*)>.*?</(?P<tag_close>mjx-container|span|div)>",
365    )
366    .expect("valid regex");
367    math_attr_re
368        .replace_all(&result, |caps: &regex::Captures<'_>| {
369            let full = caps.get(0).map_or("", |m| m.as_str());
370            let attrs = caps.name("attrs").map_or("", |m| m.as_str());
371            let tag = caps
372                .name("tag")
373                .map_or("", |m| m.as_str())
374                .to_ascii_lowercase();
375            let tag_close = caps
376                .name("tag_close")
377                .map_or("", |m| m.as_str())
378                .to_ascii_lowercase();
379
380            if tag != tag_close || !is_math_attrs(&tag, attrs) {
381                return full.to_string();
382            }
383
384            extract_attr(attrs, "data-tex")
385                .or_else(|| extract_attr(attrs, "data-latex"))
386                .or_else(|| extract_annotation_tex(full))
387                .map_or_else(
388                    || full.to_string(),
389                    |latex| format!("${}$", normalize_latex_for_html(&latex)),
390                )
391        })
392        .into_owned()
393}
394
395fn correct_code_languages(html: &str) -> String {
396    let code_re = regex::Regex::new(r"(?is)<code\b(?P<attrs>[^>]*)>(?P<body>.*?)</code>")
397        .expect("valid regex");
398
399    code_re
400        .replace_all(html, |caps: &regex::Captures<'_>| {
401            let full = caps.get(0).map_or("", |m| m.as_str());
402            let attrs = caps.name("attrs").map_or("", |m| m.as_str());
403            let body = caps.name("body").map_or("", |m| m.as_str());
404
405            if !has_matlab_language(attrs) || !looks_like_coq(body) {
406                return full.to_string();
407            }
408
409            let updated_attrs = attrs
410                .replace("language-matlab", "language-coq")
411                .replace(r#"class="matlab""#, r#"class="coq""#)
412                .replace("class='matlab'", "class='coq'");
413
414            format!("<code{updated_attrs}>{body}</code>")
415        })
416        .into_owned()
417}
418
419fn is_formula_img_tag(tag: &str) -> bool {
420    extract_attr(tag, "source").is_some()
421        || extract_attr(tag, "class").is_some_and(|classes| classes.contains("formula"))
422}
423
424fn is_math_attrs(tag: &str, attrs: &str) -> bool {
425    tag == "mjx-container"
426        || extract_attr(attrs, "class").is_some_and(|classes| {
427            classes.contains("katex") || classes.contains("math") || classes.contains("MathJax")
428        })
429}
430
431fn has_matlab_language(attrs: &str) -> bool {
432    extract_attr(attrs, "class").is_some_and(|classes| {
433        classes
434            .split_whitespace()
435            .any(|class| class == "language-matlab" || class == "matlab")
436    })
437}
438
439fn looks_like_coq(text: &str) -> bool {
440    let decoded = crate::html::decode_html_entities(text);
441    [
442        "Require Import",
443        "Definition",
444        "Fixpoint",
445        "Lemma",
446        "Theorem",
447        "Proof",
448        "Qed",
449        "Notation",
450        "Inductive",
451    ]
452    .iter()
453    .any(|needle| decoded.contains(needle))
454}
455
/// Prepare extracted LaTeX for re-insertion into HTML text.
///
/// The input is trimmed and every character that is significant to an HTML
/// parser or to the later Markdown conversion is replaced by a character
/// reference:
///
/// * `&` becomes `&amp;`, `<` becomes `&lt;`, `>` becomes `&gt;` — the LaTeX
///   reaching this function has already been entity-decoded, so a formula
///   such as `a<b` would otherwise be re-parsed as the start of a tag.
/// * `\` becomes `&#92;`, as before.
///
/// Escaping happens in a single pass over the characters, so the `&` that
/// belongs to an emitted reference is never escaped a second time.
fn normalize_latex_for_html(latex: &str) -> String {
    let trimmed = latex.trim();
    let mut escaped = String::with_capacity(trimmed.len());
    for ch in trimmed.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            '\\' => escaped.push_str("&#92;"),
            other => escaped.push(other),
        }
    }
    escaped
}
459
460fn extract_annotation_tex(html: &str) -> Option<String> {
461    let re = regex::Regex::new(
462        r#"(?is)<annotation\b[^>]*encoding\s*=\s*["']application/x-tex["'][^>]*>(.*?)</annotation>"#,
463    )
464    .ok()?;
465
466    re.captures(html).and_then(|caps| {
467        let text = caps.get(1)?.as_str().trim();
468        (!text.is_empty()).then(|| crate::html::decode_html_entities(text))
469    })
470}
471
472fn extract_attr(tag: &str, attr: &str) -> Option<String> {
473    let re = regex::Regex::new(&format!(
474        r#"(?is)\b{}\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))"#,
475        regex::escape(attr)
476    ))
477    .ok()?;
478
479    re.captures(tag).and_then(|caps| {
480        let value = caps
481            .get(1)
482            .or_else(|| caps.get(2))
483            .or_else(|| caps.get(3))?
484            .as_str()
485            .trim();
486        (!value.is_empty()).then(|| crate::html::decode_html_entities(value))
487    })
488}
489
490// Re-export commonly used types
491pub use browser::BrowserEngine;
492pub use search::{
493    search, SearchDiagnostics, SearchResult, SearchResultItem, DEFAULT_LIMIT, DEFAULT_PROVIDER,
494    SEARCH_PROVIDERS,
495};