Skip to main content

web_capture/
lib.rs

1//! # web-capture
2//!
3//! A library and CLI/microservice to render web pages as HTML, Markdown, or PNG screenshots.
4//!
5//! ## Features
6//!
7//! - Fetch HTML content from URLs
8//! - Convert HTML to Markdown
9//! - Capture PNG screenshots of web pages
10//! - Convert relative URLs to absolute URLs
11//! - Support for headless browser rendering via browser-commander
12//!
13//! ## Example
14//!
15//! ```rust,no_run
16//! use web_capture::{fetch_html, convert_html_to_markdown, capture_screenshot};
17//!
18//! #[tokio::main]
19//! async fn main() -> anyhow::Result<()> {
20//!     // Fetch HTML from a URL
21//!     let html = fetch_html("https://example.com").await?;
22//!     println!("HTML length: {}", html.len());
23//!
24//!     // Convert HTML to Markdown
25//!     let markdown = convert_html_to_markdown(&html, Some("https://example.com"))?;
26//!     println!("Markdown: {}", markdown);
27//!
28//!     // Capture a screenshot
29//!     let screenshot = capture_screenshot("https://example.com").await?;
30//!     println!("Screenshot size: {} bytes", screenshot.len());
31//!
32//!     Ok(())
33//! }
34//! ```
35
36pub mod animation;
37pub mod batch;
38pub mod browser;
39pub mod extract_images;
40pub mod figures;
41pub mod gdocs;
42pub mod html;
43pub mod latex;
44pub mod localize_images;
45pub mod markdown;
46pub mod metadata;
47pub mod postprocess;
48pub mod themed_image;
49pub mod verify;
50
51use thiserror::Error;
52
53/// Version of the web-capture library
54pub const VERSION: &str = env!("CARGO_PKG_VERSION");
55
56/// Error types for web-capture operations
57#[derive(Error, Debug)]
58pub enum WebCaptureError {
59    #[error("Failed to fetch URL: {0}")]
60    FetchError(String),
61
62    #[error("Failed to parse HTML: {0}")]
63    ParseError(String),
64
65    #[error("Failed to convert to Markdown: {0}")]
66    MarkdownError(String),
67
68    #[error("Failed to capture screenshot: {0}")]
69    ScreenshotError(String),
70
71    #[error("Browser error: {0}")]
72    BrowserError(String),
73
74    #[error("Invalid URL: {0}")]
75    InvalidUrl(String),
76
77    #[error("IO error: {0}")]
78    IoError(#[from] std::io::Error),
79
80    #[error("Request error: {0}")]
81    RequestError(#[from] reqwest::Error),
82}
83
84/// Result type for web-capture operations
85pub type Result<T> = std::result::Result<T, WebCaptureError>;
86
87/// Fetch HTML content from a URL
88///
89/// This function makes a simple HTTP GET request to fetch the HTML content.
90/// For JavaScript-heavy pages, use `render_html` instead.
91///
92/// # Arguments
93///
94/// * `url` - The URL to fetch
95///
96/// # Returns
97///
98/// The HTML content as a string
99///
100/// # Errors
101///
102/// Returns an error if the fetch fails or the response cannot be decoded
103pub async fn fetch_html(url: &str) -> Result<String> {
104    html::fetch_html(url).await
105}
106
107/// Render HTML content from a URL using a headless browser
108///
109/// This function uses browser-commander to launch a headless browser,
110/// navigate to the URL, and return the rendered HTML content.
111///
112/// # Arguments
113///
114/// * `url` - The URL to render
115///
116/// # Returns
117///
118/// The rendered HTML content as a string
119///
120/// # Errors
121///
122/// Returns an error if browser operations fail
123pub async fn render_html(url: &str) -> Result<String> {
124    browser::render_html(url).await
125}
126
127/// Convert HTML content to Markdown
128///
129/// # Arguments
130///
131/// * `html` - The HTML content to convert
132/// * `base_url` - Optional base URL for converting relative URLs to absolute
133///
134/// # Returns
135///
136/// The Markdown content as a string
137///
138/// # Errors
139///
140/// Returns an error if conversion fails
141pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
142    markdown::convert_html_to_markdown(html, base_url)
143}
144
145/// Capture a PNG screenshot of a URL
146///
147/// This function uses browser-commander to launch a headless browser,
148/// navigate to the URL, and capture a screenshot.
149///
150/// # Arguments
151///
152/// * `url` - The URL to capture
153///
154/// # Returns
155///
156/// The PNG image data as bytes
157///
158/// # Errors
159///
160/// Returns an error if browser operations fail
161pub async fn capture_screenshot(url: &str) -> Result<Vec<u8>> {
162    browser::capture_screenshot(url).await
163}
164
165/// Convert relative URLs to absolute URLs in HTML content
166///
167/// # Arguments
168///
169/// * `html` - The HTML content to process
170/// * `base_url` - The base URL to use for resolving relative URLs
171///
172/// # Returns
173///
174/// The HTML content with absolute URLs
175#[must_use]
176pub fn convert_relative_urls(html: &str, base_url: &str) -> String {
177    html::convert_relative_urls(html, base_url)
178}
179
180/// Convert HTML content to UTF-8 encoding
181///
182/// Detects the current encoding from meta tags and converts to UTF-8 if needed.
183///
184/// # Arguments
185///
186/// * `html` - The HTML content to convert
187///
188/// # Returns
189///
190/// The UTF-8 encoded HTML content
191#[must_use]
192pub fn convert_to_utf8(html: &str) -> String {
193    html::convert_to_utf8(html)
194}
195
/// Switches and selectors that drive [`convert_html_to_markdown_enhanced`].
///
/// Each `bool` enables one independent stage of the pipeline; the two
/// selectors narrow which part of the document is converted.
// The flags are independent on/off toggles rather than states of one machine,
// so the clippy suggestion to replace them with an enum does not apply.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Clone)]
pub struct EnhancedOptions {
    /// Pull LaTeX out of `img.formula`, `KaTeX` and `MathJax` elements.
    pub extract_latex: bool,
    /// Collect article metadata (author, date, hubs, tags).
    pub extract_metadata: bool,
    /// Run the Markdown post-processing pass (unicode normalization, LaTeX spacing, etc.).
    pub post_process: bool,
    /// Detect mislabelled code blocks and correct their language.
    pub detect_code_language: bool,
    /// CSS selector that limits which part of the page is converted.
    pub content_selector: Option<String>,
    /// CSS selector for the article body; the matching article title is prepended when found.
    pub body_selector: Option<String>,
}

impl Default for EnhancedOptions {
    /// All processing stages switched on, with no selector scoping.
    fn default() -> Self {
        Self {
            // Processing stages.
            extract_latex: true,
            extract_metadata: true,
            post_process: true,
            detect_code_language: true,
            // Scoping: convert the whole document.
            content_selector: None,
            body_selector: None,
        }
    }
}
226
227/// Result of enhanced HTML-to-Markdown conversion.
228#[derive(Debug, Clone)]
229pub struct EnhancedMarkdownResult {
230    pub markdown: String,
231    pub metadata: Option<metadata::ArticleMetadata>,
232}
233
234/// Convert HTML to Markdown with enhanced options.
235///
236/// Supports LaTeX formula extraction, metadata extraction, and
237/// post-processing pipeline matching the JavaScript implementation.
238///
239/// # Arguments
240///
241/// * `html` - The HTML content to convert
242/// * `base_url` - Optional base URL for resolving relative URLs
243/// * `options` - Enhanced conversion options
244///
245/// # Returns
246///
247/// Enhanced result with markdown text and optional metadata
248///
249/// # Errors
250///
251/// Returns an error if base conversion fails
252pub fn convert_html_to_markdown_enhanced(
253    html: &str,
254    base_url: Option<&str>,
255    options: &EnhancedOptions,
256) -> Result<EnhancedMarkdownResult> {
257    let mut html_for_markdown = scope_html_with_selectors(html, options);
258
259    if options.extract_latex {
260        html_for_markdown = replace_latex_formula_elements(&html_for_markdown);
261    }
262
263    if options.detect_code_language {
264        html_for_markdown = correct_code_languages(&html_for_markdown);
265    }
266
267    // Start with basic markdown conversion
268    let mut md = markdown::convert_html_to_markdown(&html_for_markdown, base_url)?;
269
270    // Extract metadata if requested
271    let extracted_metadata = if options.extract_metadata {
272        let meta = metadata::extract_metadata(html);
273        // Prepend metadata block
274        let header_lines = metadata::format_metadata_block(&meta);
275        if !header_lines.is_empty() {
276            let header = header_lines.join("\n");
277            // Insert after the first heading
278            if let Some(pos) = md.find("\n\n") {
279                md = format!("{}\n\n{}\n{}", &md[..pos], header, &md[pos + 2..]);
280            } else {
281                md = format!("{header}\n\n{md}");
282            }
283        }
284        // Append footer block
285        let footer_lines = metadata::format_footer_block(&meta);
286        if !footer_lines.is_empty() {
287            md.push_str("\n\n");
288            md.push_str(&footer_lines.join("\n"));
289        }
290        Some(meta)
291    } else {
292        None
293    };
294
295    // Apply post-processing if requested
296    if options.post_process {
297        md = postprocess::post_process_markdown(&md, &postprocess::PostProcessOptions::default());
298    }
299
300    if options.extract_latex {
301        md = normalize_extracted_latex_markdown(&md);
302    }
303
304    Ok(EnhancedMarkdownResult {
305        markdown: md,
306        metadata: extracted_metadata,
307    })
308}
309
310fn normalize_extracted_latex_markdown(markdown: &str) -> String {
311    let re = regex::Regex::new(r"\$([^$\n]+)\$").expect("valid regex");
312    re.replace_all(markdown, |caps: &regex::Captures<'_>| {
313        let formula = caps.get(1).map_or("", |m| m.as_str()).replace(r"\\", r"\");
314        format!("${formula}$")
315    })
316    .into_owned()
317}
318
319fn scope_html_with_selectors(html: &str, options: &EnhancedOptions) -> String {
320    if let Some(body_selector) = options.body_selector.as_deref() {
321        let body_html = markdown::select_html(html, body_selector);
322        let title_selector = options
323            .content_selector
324            .as_deref()
325            .map_or_else(|| "h1".to_string(), |selector| format!("{selector} h1, h1"));
326        let title_html = markdown::select_html(html, &title_selector);
327        return match (title_html, body_html) {
328            (Some(title), Some(body)) => format!("{title}\n{body}"),
329            (None, Some(body)) => body,
330            _ => html.to_string(),
331        };
332    }
333
334    options
335        .content_selector
336        .as_deref()
337        .and_then(|selector| markdown::select_html(html, selector))
338        .unwrap_or_else(|| html.to_string())
339}
340
341fn replace_latex_formula_elements(html: &str) -> String {
342    let mut result = html.to_string();
343
344    let img_formula_re = regex::Regex::new(r"(?is)<img\b[^>]*>").expect("valid regex");
345    result = img_formula_re
346        .replace_all(&result, |caps: &regex::Captures<'_>| {
347            let tag = caps.get(0).map_or("", |m| m.as_str());
348            if is_formula_img_tag(tag) {
349                extract_attr(tag, "source")
350                    .or_else(|| extract_attr(tag, "alt"))
351                    .map_or_else(
352                        || tag.to_string(),
353                        |latex| format!("${}$", normalize_latex_for_html(&latex)),
354                    )
355            } else {
356                tag.to_string()
357            }
358        })
359        .into_owned();
360
361    let math_attr_re = regex::Regex::new(
362        r"(?is)<(?P<tag>mjx-container|span|div)\b(?P<attrs>[^>]*)>.*?</(?P<tag_close>mjx-container|span|div)>",
363    )
364    .expect("valid regex");
365    math_attr_re
366        .replace_all(&result, |caps: &regex::Captures<'_>| {
367            let full = caps.get(0).map_or("", |m| m.as_str());
368            let attrs = caps.name("attrs").map_or("", |m| m.as_str());
369            let tag = caps
370                .name("tag")
371                .map_or("", |m| m.as_str())
372                .to_ascii_lowercase();
373            let tag_close = caps
374                .name("tag_close")
375                .map_or("", |m| m.as_str())
376                .to_ascii_lowercase();
377
378            if tag != tag_close || !is_math_attrs(&tag, attrs) {
379                return full.to_string();
380            }
381
382            extract_attr(attrs, "data-tex")
383                .or_else(|| extract_attr(attrs, "data-latex"))
384                .or_else(|| extract_annotation_tex(full))
385                .map_or_else(
386                    || full.to_string(),
387                    |latex| format!("${}$", normalize_latex_for_html(&latex)),
388                )
389        })
390        .into_owned()
391}
392
393fn correct_code_languages(html: &str) -> String {
394    let code_re = regex::Regex::new(r"(?is)<code\b(?P<attrs>[^>]*)>(?P<body>.*?)</code>")
395        .expect("valid regex");
396
397    code_re
398        .replace_all(html, |caps: &regex::Captures<'_>| {
399            let full = caps.get(0).map_or("", |m| m.as_str());
400            let attrs = caps.name("attrs").map_or("", |m| m.as_str());
401            let body = caps.name("body").map_or("", |m| m.as_str());
402
403            if !has_matlab_language(attrs) || !looks_like_coq(body) {
404                return full.to_string();
405            }
406
407            let updated_attrs = attrs
408                .replace("language-matlab", "language-coq")
409                .replace(r#"class="matlab""#, r#"class="coq""#)
410                .replace("class='matlab'", "class='coq'");
411
412            format!("<code{updated_attrs}>{body}</code>")
413        })
414        .into_owned()
415}
416
417fn is_formula_img_tag(tag: &str) -> bool {
418    extract_attr(tag, "source").is_some()
419        || extract_attr(tag, "class").is_some_and(|classes| classes.contains("formula"))
420}
421
422fn is_math_attrs(tag: &str, attrs: &str) -> bool {
423    tag == "mjx-container"
424        || extract_attr(attrs, "class").is_some_and(|classes| {
425            classes.contains("katex") || classes.contains("math") || classes.contains("MathJax")
426        })
427}
428
429fn has_matlab_language(attrs: &str) -> bool {
430    extract_attr(attrs, "class").is_some_and(|classes| {
431        classes
432            .split_whitespace()
433            .any(|class| class == "language-matlab" || class == "matlab")
434    })
435}
436
437fn looks_like_coq(text: &str) -> bool {
438    let decoded = crate::html::decode_html_entities(text);
439    [
440        "Require Import",
441        "Definition",
442        "Fixpoint",
443        "Lemma",
444        "Theorem",
445        "Proof",
446        "Qed",
447        "Notation",
448        "Inductive",
449    ]
450    .iter()
451    .any(|needle| decoded.contains(needle))
452}
453
/// Prepare a LaTeX string for embedding in HTML.
///
/// Surrounding whitespace is trimmed and every backslash is written as the
/// numeric character reference `&#92;`.
fn normalize_latex_for_html(latex: &str) -> String {
    let trimmed = latex.trim();

    let mut encoded = String::with_capacity(trimmed.len());
    for ch in trimmed.chars() {
        if ch == '\\' {
            encoded.push_str("&#92;");
        } else {
            encoded.push(ch);
        }
    }
    encoded
}
457
458fn extract_annotation_tex(html: &str) -> Option<String> {
459    let re = regex::Regex::new(
460        r#"(?is)<annotation\b[^>]*encoding\s*=\s*["']application/x-tex["'][^>]*>(.*?)</annotation>"#,
461    )
462    .ok()?;
463
464    re.captures(html).and_then(|caps| {
465        let text = caps.get(1)?.as_str().trim();
466        (!text.is_empty()).then(|| crate::html::decode_html_entities(text))
467    })
468}
469
470fn extract_attr(tag: &str, attr: &str) -> Option<String> {
471    let re = regex::Regex::new(&format!(
472        r#"(?is)\b{}\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))"#,
473        regex::escape(attr)
474    ))
475    .ok()?;
476
477    re.captures(tag).and_then(|caps| {
478        let value = caps
479            .get(1)
480            .or_else(|| caps.get(2))
481            .or_else(|| caps.get(3))?
482            .as_str()
483            .trim();
484        (!value.is_empty()).then(|| crate::html::decode_html_entities(value))
485    })
486}
487
488// Re-export commonly used types
489pub use browser::BrowserEngine;