Skip to main content

web_capture/
lib.rs

1//! # web-capture
2//!
3//! A library and CLI/microservice to render web pages as HTML, Markdown, or PNG screenshots.
4//!
5//! ## Features
6//!
7//! - Fetch HTML content from URLs
8//! - Convert HTML to Markdown
9//! - Capture PNG screenshots of web pages
10//! - Convert relative URLs to absolute URLs
11//! - Support for headless browser rendering via browser-commander
12//!
13//! ## Example
14//!
15//! ```rust,no_run
16//! use web_capture::{fetch_html, convert_html_to_markdown, capture_screenshot};
17//!
18//! #[tokio::main]
19//! async fn main() -> anyhow::Result<()> {
20//!     // Fetch HTML from a URL
21//!     let html = fetch_html("https://example.com").await?;
22//!     println!("HTML length: {}", html.len());
23//!
24//!     // Convert HTML to Markdown
25//!     let markdown = convert_html_to_markdown(&html, Some("https://example.com"))?;
26//!     println!("Markdown: {}", markdown);
27//!
28//!     // Capture a screenshot
29//!     let screenshot = capture_screenshot("https://example.com").await?;
30//!     println!("Screenshot size: {} bytes", screenshot.len());
31//!
32//!     Ok(())
33//! }
34//! ```
35
36pub mod animation;
37pub mod archive;
38pub mod batch;
39pub mod browser;
40pub mod extract_images;
41pub mod figures;
42pub mod gdocs;
43pub mod html;
44pub mod latex;
45pub mod localize_images;
46pub mod markdown;
47pub mod metadata;
48pub mod postprocess;
49pub mod themed_image;
50pub mod verify;
51
52use thiserror::Error;
53
54/// Version of the web-capture library
///
/// Captured at compile time from the `CARGO_PKG_VERSION` environment variable via `env!`.
55pub const VERSION: &str = env!("CARGO_PKG_VERSION");
56
57/// Error types for web-capture operations
58#[derive(Error, Debug)]
59pub enum WebCaptureError {
60    #[error("Failed to fetch URL: {0}")]
61    FetchError(String),
62
63    #[error("Failed to parse HTML: {0}")]
64    ParseError(String),
65
66    #[error("Failed to convert to Markdown: {0}")]
67    MarkdownError(String),
68
69    #[error("Failed to capture screenshot: {0}")]
70    ScreenshotError(String),
71
72    #[error("Browser error: {0}")]
73    BrowserError(String),
74
75    #[error("Invalid URL: {0}")]
76    InvalidUrl(String),
77
78    #[error("IO error: {0}")]
79    IoError(#[from] std::io::Error),
80
81    #[error("Request error: {0}")]
82    RequestError(#[from] reqwest::Error),
83}
84
85/// Result type for web-capture operations
86pub type Result<T> = std::result::Result<T, WebCaptureError>;
87
88/// Fetch HTML content from a URL
89///
90/// This function makes a simple HTTP GET request to fetch the HTML content.
91/// For JavaScript-heavy pages, use `render_html` instead.
92///
93/// # Arguments
94///
95/// * `url` - The URL to fetch
96///
97/// # Returns
98///
99/// The HTML content as a string
100///
101/// # Errors
102///
103/// Returns an error if the fetch fails or the response cannot be decoded
104pub async fn fetch_html(url: &str) -> Result<String> {
105    html::fetch_html(url).await
106}
107
108/// Render HTML content from a URL using a headless browser
109///
110/// This function uses browser-commander to launch a headless browser,
111/// navigate to the URL, and return the rendered HTML content.
112///
113/// # Arguments
114///
115/// * `url` - The URL to render
116///
117/// # Returns
118///
119/// The rendered HTML content as a string
120///
121/// # Errors
122///
123/// Returns an error if browser operations fail
124pub async fn render_html(url: &str) -> Result<String> {
125    browser::render_html(url).await
126}
127
128/// Convert HTML content to Markdown
129///
130/// # Arguments
131///
132/// * `html` - The HTML content to convert
133/// * `base_url` - Optional base URL for converting relative URLs to absolute
134///
135/// # Returns
136///
137/// The Markdown content as a string
138///
139/// # Errors
140///
141/// Returns an error if conversion fails
142pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
143    markdown::convert_html_to_markdown(html, base_url)
144}
145
146/// Capture a PNG screenshot of a URL
147///
148/// This function uses browser-commander to launch a headless browser,
149/// navigate to the URL, and capture a screenshot.
150///
151/// # Arguments
152///
153/// * `url` - The URL to capture
154///
155/// # Returns
156///
157/// The PNG image data as bytes
158///
159/// # Errors
160///
161/// Returns an error if browser operations fail
162pub async fn capture_screenshot(url: &str) -> Result<Vec<u8>> {
163    browser::capture_screenshot(url).await
164}
165
166/// Convert relative URLs to absolute URLs in HTML content
167///
168/// # Arguments
169///
170/// * `html` - The HTML content to process
171/// * `base_url` - The base URL to use for resolving relative URLs
172///
173/// # Returns
174///
175/// The HTML content with absolute URLs
176#[must_use]
177pub fn convert_relative_urls(html: &str, base_url: &str) -> String {
178    html::convert_relative_urls(html, base_url)
179}
180
181/// Convert HTML content to UTF-8 encoding
182///
183/// Detects the current encoding from meta tags and converts to UTF-8 if needed.
184///
185/// # Arguments
186///
187/// * `html` - The HTML content to convert
188///
189/// # Returns
190///
191/// The UTF-8 encoded HTML content
192#[must_use]
193pub fn convert_to_utf8(html: &str) -> String {
194    html::convert_to_utf8(html)
195}
196
/// Switches and selectors that control enhanced HTML-to-Markdown conversion.
///
/// The [`Default`] value turns every boolean enhancement on and leaves both
/// selectors unset.
#[allow(clippy::struct_excessive_bools)] // each flag is an independent feature toggle
#[derive(Clone, Debug)]
pub struct EnhancedOptions {
    /// Pull LaTeX formulas out of img.formula, `KaTeX`, `MathJax` elements.
    pub extract_latex: bool,
    /// Collect article metadata (author, date, hubs, tags).
    pub extract_metadata: bool,
    /// Run the post-processing pass (unicode normalization, LaTeX spacing, etc.).
    pub post_process: bool,
    /// Detect mislabelled code block languages and correct them.
    pub detect_code_language: bool,
    /// CSS selector that limits which part of the page is converted.
    pub content_selector: Option<String>,
    /// CSS selector for the article body; the selected article title is
    /// prepended to it when one is found.
    pub body_selector: Option<String>,
}

impl Default for EnhancedOptions {
    /// All enhancements enabled, no selectors.
    fn default() -> Self {
        let enabled = true;
        Self {
            extract_latex: enabled,
            extract_metadata: enabled,
            post_process: enabled,
            detect_code_language: enabled,
            content_selector: None,
            body_selector: None,
        }
    }
}
227
228/// Result of enhanced HTML-to-Markdown conversion.
229#[derive(Debug, Clone)]
230pub struct EnhancedMarkdownResult {
231    pub markdown: String,
232    pub metadata: Option<metadata::ArticleMetadata>,
233}
234
235/// Convert HTML to Markdown with enhanced options.
236///
237/// Supports LaTeX formula extraction, metadata extraction, and
238/// post-processing pipeline matching the JavaScript implementation.
239///
240/// # Arguments
241///
242/// * `html` - The HTML content to convert
243/// * `base_url` - Optional base URL for resolving relative URLs
244/// * `options` - Enhanced conversion options
245///
246/// # Returns
247///
248/// Enhanced result with markdown text and optional metadata
249///
250/// # Errors
251///
252/// Returns an error if base conversion fails
253pub fn convert_html_to_markdown_enhanced(
254    html: &str,
255    base_url: Option<&str>,
256    options: &EnhancedOptions,
257) -> Result<EnhancedMarkdownResult> {
258    let mut html_for_markdown = scope_html_with_selectors(html, options);
259
260    if options.extract_latex {
261        html_for_markdown = replace_latex_formula_elements(&html_for_markdown);
262    }
263
264    if options.detect_code_language {
265        html_for_markdown = correct_code_languages(&html_for_markdown);
266    }
267
268    // Start with basic markdown conversion
269    let mut md = markdown::convert_html_to_markdown(&html_for_markdown, base_url)?;
270
271    // Extract metadata if requested
272    let extracted_metadata = if options.extract_metadata {
273        let meta = metadata::extract_metadata(html);
274        // Prepend metadata block
275        let header_lines = metadata::format_metadata_block(&meta);
276        if !header_lines.is_empty() {
277            let header = header_lines.join("\n");
278            // Insert after the first heading
279            if let Some(pos) = md.find("\n\n") {
280                md = format!("{}\n\n{}\n{}", &md[..pos], header, &md[pos + 2..]);
281            } else {
282                md = format!("{header}\n\n{md}");
283            }
284        }
285        // Append footer block
286        let footer_lines = metadata::format_footer_block(&meta);
287        if !footer_lines.is_empty() {
288            md.push_str("\n\n");
289            md.push_str(&footer_lines.join("\n"));
290        }
291        Some(meta)
292    } else {
293        None
294    };
295
296    // Apply post-processing if requested
297    if options.post_process {
298        md = postprocess::post_process_markdown(&md, &postprocess::PostProcessOptions::default());
299    }
300
301    if options.extract_latex {
302        md = normalize_extracted_latex_markdown(&md);
303    }
304
305    Ok(EnhancedMarkdownResult {
306        markdown: md,
307        metadata: extracted_metadata,
308    })
309}
310
311fn normalize_extracted_latex_markdown(markdown: &str) -> String {
312    let re = regex::Regex::new(r"\$([^$\n]+)\$").expect("valid regex");
313    re.replace_all(markdown, |caps: &regex::Captures<'_>| {
314        let formula = caps.get(1).map_or("", |m| m.as_str()).replace(r"\\", r"\");
315        format!("${formula}$")
316    })
317    .into_owned()
318}
319
320fn scope_html_with_selectors(html: &str, options: &EnhancedOptions) -> String {
321    if let Some(body_selector) = options.body_selector.as_deref() {
322        let body_html = markdown::select_html(html, body_selector);
323        let title_selector = options
324            .content_selector
325            .as_deref()
326            .map_or_else(|| "h1".to_string(), |selector| format!("{selector} h1, h1"));
327        let title_html = markdown::select_html(html, &title_selector);
328        return match (title_html, body_html) {
329            (Some(title), Some(body)) => format!("{title}\n{body}"),
330            (None, Some(body)) => body,
331            _ => html.to_string(),
332        };
333    }
334
335    options
336        .content_selector
337        .as_deref()
338        .and_then(|selector| markdown::select_html(html, selector))
339        .unwrap_or_else(|| html.to_string())
340}
341
342fn replace_latex_formula_elements(html: &str) -> String {
343    let mut result = html.to_string();
344
345    let img_formula_re = regex::Regex::new(r"(?is)<img\b[^>]*>").expect("valid regex");
346    result = img_formula_re
347        .replace_all(&result, |caps: &regex::Captures<'_>| {
348            let tag = caps.get(0).map_or("", |m| m.as_str());
349            if is_formula_img_tag(tag) {
350                extract_attr(tag, "source")
351                    .or_else(|| extract_attr(tag, "alt"))
352                    .map_or_else(
353                        || tag.to_string(),
354                        |latex| format!("${}$", normalize_latex_for_html(&latex)),
355                    )
356            } else {
357                tag.to_string()
358            }
359        })
360        .into_owned();
361
362    let math_attr_re = regex::Regex::new(
363        r"(?is)<(?P<tag>mjx-container|span|div)\b(?P<attrs>[^>]*)>.*?</(?P<tag_close>mjx-container|span|div)>",
364    )
365    .expect("valid regex");
366    math_attr_re
367        .replace_all(&result, |caps: &regex::Captures<'_>| {
368            let full = caps.get(0).map_or("", |m| m.as_str());
369            let attrs = caps.name("attrs").map_or("", |m| m.as_str());
370            let tag = caps
371                .name("tag")
372                .map_or("", |m| m.as_str())
373                .to_ascii_lowercase();
374            let tag_close = caps
375                .name("tag_close")
376                .map_or("", |m| m.as_str())
377                .to_ascii_lowercase();
378
379            if tag != tag_close || !is_math_attrs(&tag, attrs) {
380                return full.to_string();
381            }
382
383            extract_attr(attrs, "data-tex")
384                .or_else(|| extract_attr(attrs, "data-latex"))
385                .or_else(|| extract_annotation_tex(full))
386                .map_or_else(
387                    || full.to_string(),
388                    |latex| format!("${}$", normalize_latex_for_html(&latex)),
389                )
390        })
391        .into_owned()
392}
393
394fn correct_code_languages(html: &str) -> String {
395    let code_re = regex::Regex::new(r"(?is)<code\b(?P<attrs>[^>]*)>(?P<body>.*?)</code>")
396        .expect("valid regex");
397
398    code_re
399        .replace_all(html, |caps: &regex::Captures<'_>| {
400            let full = caps.get(0).map_or("", |m| m.as_str());
401            let attrs = caps.name("attrs").map_or("", |m| m.as_str());
402            let body = caps.name("body").map_or("", |m| m.as_str());
403
404            if !has_matlab_language(attrs) || !looks_like_coq(body) {
405                return full.to_string();
406            }
407
408            let updated_attrs = attrs
409                .replace("language-matlab", "language-coq")
410                .replace(r#"class="matlab""#, r#"class="coq""#)
411                .replace("class='matlab'", "class='coq'");
412
413            format!("<code{updated_attrs}>{body}</code>")
414        })
415        .into_owned()
416}
417
418fn is_formula_img_tag(tag: &str) -> bool {
419    extract_attr(tag, "source").is_some()
420        || extract_attr(tag, "class").is_some_and(|classes| classes.contains("formula"))
421}
422
423fn is_math_attrs(tag: &str, attrs: &str) -> bool {
424    tag == "mjx-container"
425        || extract_attr(attrs, "class").is_some_and(|classes| {
426            classes.contains("katex") || classes.contains("math") || classes.contains("MathJax")
427        })
428}
429
430fn has_matlab_language(attrs: &str) -> bool {
431    extract_attr(attrs, "class").is_some_and(|classes| {
432        classes
433            .split_whitespace()
434            .any(|class| class == "language-matlab" || class == "matlab")
435    })
436}
437
438fn looks_like_coq(text: &str) -> bool {
439    let decoded = crate::html::decode_html_entities(text);
440    [
441        "Require Import",
442        "Definition",
443        "Fixpoint",
444        "Lemma",
445        "Theorem",
446        "Proof",
447        "Qed",
448        "Notation",
449        "Inductive",
450    ]
451    .iter()
452    .any(|needle| decoded.contains(needle))
453}
454
/// Trim surrounding whitespace from a LaTeX source and replace every backslash
/// with the HTML character reference `&#92;`.
fn normalize_latex_for_html(latex: &str) -> String {
    let trimmed = latex.trim();
    let mut escaped = String::with_capacity(trimmed.len());
    for ch in trimmed.chars() {
        if ch == '\\' {
            escaped.push_str("&#92;");
        } else {
            escaped.push(ch);
        }
    }
    escaped
}
458
459fn extract_annotation_tex(html: &str) -> Option<String> {
460    let re = regex::Regex::new(
461        r#"(?is)<annotation\b[^>]*encoding\s*=\s*["']application/x-tex["'][^>]*>(.*?)</annotation>"#,
462    )
463    .ok()?;
464
465    re.captures(html).and_then(|caps| {
466        let text = caps.get(1)?.as_str().trim();
467        (!text.is_empty()).then(|| crate::html::decode_html_entities(text))
468    })
469}
470
471fn extract_attr(tag: &str, attr: &str) -> Option<String> {
472    let re = regex::Regex::new(&format!(
473        r#"(?is)\b{}\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))"#,
474        regex::escape(attr)
475    ))
476    .ok()?;
477
478    re.captures(tag).and_then(|caps| {
479        let value = caps
480            .get(1)
481            .or_else(|| caps.get(2))
482            .or_else(|| caps.get(3))?
483            .as_str()
484            .trim();
485        (!value.is_empty()).then(|| crate::html::decode_html_entities(value))
486    })
487}
488
489// Re-export commonly used types
490pub use browser::BrowserEngine;