Skip to main content

web_capture/
lib.rs

1//! # web-capture
2//!
3//! A library and CLI/microservice to render web pages as HTML, Markdown, or PNG screenshots.
4//!
5//! ## Features
6//!
7//! - Fetch HTML content from URLs
8//! - Convert HTML to Markdown
9//! - Capture PNG screenshots of web pages
10//! - Convert relative URLs to absolute URLs
11//! - Support for headless browser rendering via browser-commander
12//!
13//! ## Example
14//!
15//! ```rust,no_run
16//! use web_capture::{fetch_html, convert_html_to_markdown, capture_screenshot};
17//!
18//! #[tokio::main]
19//! async fn main() -> anyhow::Result<()> {
20//!     // Fetch HTML from a URL
21//!     let html = fetch_html("https://example.com").await?;
22//!     println!("HTML length: {}", html.len());
23//!
24//!     // Convert HTML to Markdown
25//!     let markdown = convert_html_to_markdown(&html, Some("https://example.com"))?;
26//!     println!("Markdown: {}", markdown);
27//!
28//!     // Capture a screenshot
29//!     let screenshot = capture_screenshot("https://example.com").await?;
30//!     println!("Screenshot size: {} bytes", screenshot.len());
31//!
32//!     Ok(())
33//! }
34//! ```
35
36pub mod animation;
37pub mod batch;
38pub mod browser;
39pub mod figures;
40pub mod gdocs;
41pub mod html;
42pub mod latex;
43pub mod localize_images;
44pub mod markdown;
45pub mod metadata;
46pub mod postprocess;
47pub mod themed_image;
48pub mod verify;
49
50use thiserror::Error;
51
52/// Version of the web-capture library
53pub const VERSION: &str = env!("CARGO_PKG_VERSION");
54
55/// Error types for web-capture operations
56#[derive(Error, Debug)]
57pub enum WebCaptureError {
58    #[error("Failed to fetch URL: {0}")]
59    FetchError(String),
60
61    #[error("Failed to parse HTML: {0}")]
62    ParseError(String),
63
64    #[error("Failed to convert to Markdown: {0}")]
65    MarkdownError(String),
66
67    #[error("Failed to capture screenshot: {0}")]
68    ScreenshotError(String),
69
70    #[error("Browser error: {0}")]
71    BrowserError(String),
72
73    #[error("Invalid URL: {0}")]
74    InvalidUrl(String),
75
76    #[error("IO error: {0}")]
77    IoError(#[from] std::io::Error),
78
79    #[error("Request error: {0}")]
80    RequestError(#[from] reqwest::Error),
81}
82
83/// Result type for web-capture operations
84pub type Result<T> = std::result::Result<T, WebCaptureError>;
85
86/// Fetch HTML content from a URL
87///
88/// This function makes a simple HTTP GET request to fetch the HTML content.
89/// For JavaScript-heavy pages, use `render_html` instead.
90///
91/// # Arguments
92///
93/// * `url` - The URL to fetch
94///
95/// # Returns
96///
97/// The HTML content as a string
98///
99/// # Errors
100///
101/// Returns an error if the fetch fails or the response cannot be decoded
102pub async fn fetch_html(url: &str) -> Result<String> {
103    html::fetch_html(url).await
104}
105
106/// Render HTML content from a URL using a headless browser
107///
108/// This function uses browser-commander to launch a headless browser,
109/// navigate to the URL, and return the rendered HTML content.
110///
111/// # Arguments
112///
113/// * `url` - The URL to render
114///
115/// # Returns
116///
117/// The rendered HTML content as a string
118///
119/// # Errors
120///
121/// Returns an error if browser operations fail
122pub async fn render_html(url: &str) -> Result<String> {
123    browser::render_html(url).await
124}
125
126/// Convert HTML content to Markdown
127///
128/// # Arguments
129///
130/// * `html` - The HTML content to convert
131/// * `base_url` - Optional base URL for converting relative URLs to absolute
132///
133/// # Returns
134///
135/// The Markdown content as a string
136///
137/// # Errors
138///
139/// Returns an error if conversion fails
140pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
141    markdown::convert_html_to_markdown(html, base_url)
142}
143
144/// Capture a PNG screenshot of a URL
145///
146/// This function uses browser-commander to launch a headless browser,
147/// navigate to the URL, and capture a screenshot.
148///
149/// # Arguments
150///
151/// * `url` - The URL to capture
152///
153/// # Returns
154///
155/// The PNG image data as bytes
156///
157/// # Errors
158///
159/// Returns an error if browser operations fail
160pub async fn capture_screenshot(url: &str) -> Result<Vec<u8>> {
161    browser::capture_screenshot(url).await
162}
163
164/// Convert relative URLs to absolute URLs in HTML content
165///
166/// # Arguments
167///
168/// * `html` - The HTML content to process
169/// * `base_url` - The base URL to use for resolving relative URLs
170///
171/// # Returns
172///
173/// The HTML content with absolute URLs
174#[must_use]
175pub fn convert_relative_urls(html: &str, base_url: &str) -> String {
176    html::convert_relative_urls(html, base_url)
177}
178
179/// Convert HTML content to UTF-8 encoding
180///
181/// Detects the current encoding from meta tags and converts to UTF-8 if needed.
182///
183/// # Arguments
184///
185/// * `html` - The HTML content to convert
186///
187/// # Returns
188///
189/// The UTF-8 encoded HTML content
190#[must_use]
191pub fn convert_to_utf8(html: &str) -> String {
192    html::convert_to_utf8(html)
193}
194
/// Feature switches for enhanced HTML-to-Markdown conversion.
///
/// Every switch is enabled by [`Default::default`].
// The bools are independent on/off toggles rather than an encoded state
// machine, so the clippy lint is silenced deliberately.
#[allow(clippy::struct_excessive_bools)]
#[derive(Clone, Debug)]
pub struct EnhancedOptions {
    /// Extract LaTeX formulas from img.formula, `KaTeX`, `MathJax` elements.
    pub extract_latex: bool,
    /// Extract article metadata (author, date, hubs, tags).
    pub extract_metadata: bool,
    /// Apply post-processing (unicode normalization, LaTeX spacing, etc.).
    pub post_process: bool,
    /// Detect and correct code block languages.
    pub detect_code_language: bool,
}

impl Default for EnhancedOptions {
    /// Build an option set with every feature switched on.
    fn default() -> Self {
        let enabled = true;
        Self {
            extract_latex: enabled,
            extract_metadata: enabled,
            post_process: enabled,
            detect_code_language: enabled,
        }
    }
}
219
220/// Result of enhanced HTML-to-Markdown conversion.
221#[derive(Debug, Clone)]
222pub struct EnhancedMarkdownResult {
223    pub markdown: String,
224    pub metadata: Option<metadata::ArticleMetadata>,
225}
226
227/// Convert HTML to Markdown with enhanced options.
228///
229/// Supports LaTeX formula extraction, metadata extraction, and
230/// post-processing pipeline matching the JavaScript implementation.
231///
232/// # Arguments
233///
234/// * `html` - The HTML content to convert
235/// * `base_url` - Optional base URL for resolving relative URLs
236/// * `options` - Enhanced conversion options
237///
238/// # Returns
239///
240/// Enhanced result with markdown text and optional metadata
241///
242/// # Errors
243///
244/// Returns an error if base conversion fails
245pub fn convert_html_to_markdown_enhanced(
246    html: &str,
247    base_url: Option<&str>,
248    options: &EnhancedOptions,
249) -> Result<EnhancedMarkdownResult> {
250    // Start with basic markdown conversion
251    let mut md = markdown::convert_html_to_markdown(html, base_url)?;
252
253    // Extract metadata if requested
254    let extracted_metadata = if options.extract_metadata {
255        let meta = metadata::extract_metadata(html);
256        // Prepend metadata block
257        let header_lines = metadata::format_metadata_block(&meta);
258        if !header_lines.is_empty() {
259            let header = header_lines.join("\n");
260            // Insert after the first heading
261            if let Some(pos) = md.find("\n\n") {
262                md = format!("{}\n\n{}\n{}", &md[..pos], header, &md[pos + 2..]);
263            } else {
264                md = format!("{header}\n\n{md}");
265            }
266        }
267        // Append footer block
268        let footer_lines = metadata::format_footer_block(&meta);
269        if !footer_lines.is_empty() {
270            md.push_str("\n\n");
271            md.push_str(&footer_lines.join("\n"));
272        }
273        Some(meta)
274    } else {
275        None
276    };
277
278    // Apply post-processing if requested
279    if options.post_process {
280        md = postprocess::post_process_markdown(&md, &postprocess::PostProcessOptions::default());
281    }
282
283    Ok(EnhancedMarkdownResult {
284        markdown: md,
285        metadata: extracted_metadata,
286    })
287}
288
289// Re-export commonly used types
290pub use browser::BrowserEngine;