Skip to main content

web_capture/
lib.rs

1//! # web-capture
2//!
3//! A library and CLI/microservice to render web pages as HTML, Markdown, or PNG screenshots.
4//!
5//! ## Features
6//!
7//! - Fetch HTML content from URLs
8//! - Convert HTML to Markdown
9//! - Capture PNG screenshots of web pages
10//! - Convert relative URLs to absolute URLs
11//! - Support for headless browser rendering via browser-commander
12//!
13//! ## Example
14//!
15//! ```rust,no_run
16//! use web_capture::{fetch_html, convert_html_to_markdown, capture_screenshot};
17//!
18//! #[tokio::main]
19//! async fn main() -> anyhow::Result<()> {
20//!     // Fetch HTML from a URL
21//!     let html = fetch_html("https://example.com").await?;
22//!     println!("HTML length: {}", html.len());
23//!
24//!     // Convert HTML to Markdown
25//!     let markdown = convert_html_to_markdown(&html, Some("https://example.com"))?;
26//!     println!("Markdown: {}", markdown);
27//!
28//!     // Capture a screenshot
29//!     let screenshot = capture_screenshot("https://example.com").await?;
30//!     println!("Screenshot size: {} bytes", screenshot.len());
31//!
32//!     Ok(())
33//! }
34//! ```
35
36pub mod animation;
37pub mod batch;
38pub mod browser;
39pub mod extract_images;
40pub mod figures;
41pub mod gdocs;
42pub mod html;
43pub mod latex;
44pub mod localize_images;
45pub mod markdown;
46pub mod metadata;
47pub mod postprocess;
48pub mod themed_image;
49pub mod verify;
50
51use thiserror::Error;
52
/// Version of the web-capture library.
///
/// Read from the `CARGO_PKG_VERSION` environment variable at compile time
/// via `env!`.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
55
/// Error types for web-capture operations.
///
/// The `Display` text of each variant is given by its `#[error(...)]`
/// attribute, with `{0}` replaced by the variant's payload.
#[derive(Error, Debug)]
pub enum WebCaptureError {
    /// Fetching a URL failed; the payload is the detail shown after the prefix.
    #[error("Failed to fetch URL: {0}")]
    FetchError(String),

    /// HTML parsing failed; the payload is the detail shown after the prefix.
    #[error("Failed to parse HTML: {0}")]
    ParseError(String),

    /// Markdown conversion failed; the payload is the detail shown after the prefix.
    #[error("Failed to convert to Markdown: {0}")]
    MarkdownError(String),

    /// Screenshot capture failed; the payload is the detail shown after the prefix.
    #[error("Failed to capture screenshot: {0}")]
    ScreenshotError(String),

    /// A browser-related failure; the payload is the detail shown after the prefix.
    #[error("Browser error: {0}")]
    BrowserError(String),

    /// A URL was rejected as invalid; the payload is shown after the prefix.
    #[error("Invalid URL: {0}")]
    InvalidUrl(String),

    /// Wraps a [`std::io::Error`]. `#[from]` derives the `From` conversion,
    /// so `?` on an I/O result converts into this variant.
    #[error("IO error: {0}")]
    IoError(#[from] std::io::Error),

    /// Wraps a `reqwest::Error`. `#[from]` derives the `From` conversion,
    /// so `?` on a reqwest result converts into this variant.
    #[error("Request error: {0}")]
    RequestError(#[from] reqwest::Error),
}
83
/// Result type for web-capture operations.
///
/// Alias for [`std::result::Result`] with the error type fixed to
/// [`WebCaptureError`].
pub type Result<T> = std::result::Result<T, WebCaptureError>;
86
/// Fetch HTML content from a URL
///
/// This function makes a simple HTTP GET request to fetch the HTML content.
/// For JavaScript-heavy pages, use [`render_html`] instead.
///
/// Crate-root convenience wrapper: forwards `url` to `html::fetch_html` and
/// returns its result unchanged.
///
/// # Arguments
///
/// * `url` - The URL to fetch
///
/// # Returns
///
/// The HTML content as a string
///
/// # Errors
///
/// Returns an error if the fetch fails or the response cannot be decoded
pub async fn fetch_html(url: &str) -> Result<String> {
    html::fetch_html(url).await
}
106
/// Render HTML content from a URL using a headless browser
///
/// This function uses browser-commander to launch a headless browser,
/// navigate to the URL, and return the rendered HTML content.
///
/// Crate-root convenience wrapper: forwards `url` to `browser::render_html`
/// and returns its result unchanged.
///
/// # Arguments
///
/// * `url` - The URL to render
///
/// # Returns
///
/// The rendered HTML content as a string
///
/// # Errors
///
/// Returns an error if browser operations fail
pub async fn render_html(url: &str) -> Result<String> {
    browser::render_html(url).await
}
126
/// Convert HTML content to Markdown
///
/// Crate-root convenience wrapper: forwards both arguments to
/// `markdown::convert_html_to_markdown` and returns its result unchanged.
/// See [`convert_html_to_markdown_enhanced`] for the variant that also
/// handles metadata and post-processing.
///
/// # Arguments
///
/// * `html` - The HTML content to convert
/// * `base_url` - Optional base URL for converting relative URLs to absolute
///
/// # Returns
///
/// The Markdown content as a string
///
/// # Errors
///
/// Returns an error if conversion fails
pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
    markdown::convert_html_to_markdown(html, base_url)
}
144
/// Capture a PNG screenshot of a URL
///
/// This function uses browser-commander to launch a headless browser,
/// navigate to the URL, and capture a screenshot.
///
/// Crate-root convenience wrapper: forwards `url` to
/// `browser::capture_screenshot` and returns its result unchanged.
///
/// # Arguments
///
/// * `url` - The URL to capture
///
/// # Returns
///
/// The PNG image data as bytes
///
/// # Errors
///
/// Returns an error if browser operations fail
pub async fn capture_screenshot(url: &str) -> Result<Vec<u8>> {
    browser::capture_screenshot(url).await
}
164
/// Convert relative URLs to absolute URLs in HTML content
///
/// Crate-root convenience wrapper: forwards both arguments to
/// `html::convert_relative_urls` and returns its result unchanged. The
/// input is borrowed and a new `String` is returned, so the result must be
/// used (`#[must_use]`).
///
/// # Arguments
///
/// * `html` - The HTML content to process
/// * `base_url` - The base URL to use for resolving relative URLs
///
/// # Returns
///
/// The HTML content with absolute URLs
#[must_use]
pub fn convert_relative_urls(html: &str, base_url: &str) -> String {
    html::convert_relative_urls(html, base_url)
}
179
/// Convert HTML content to UTF-8 encoding
///
/// Detects the current encoding from meta tags and converts to UTF-8 if needed.
///
/// Crate-root convenience wrapper: forwards `html` to `html::convert_to_utf8`
/// and returns its result unchanged. The input is borrowed and a new `String`
/// is returned, so the result must be used (`#[must_use]`).
///
/// # Arguments
///
/// * `html` - The HTML content to convert
///
/// # Returns
///
/// The UTF-8 encoded HTML content
#[must_use]
pub fn convert_to_utf8(html: &str) -> String {
    html::convert_to_utf8(html)
}
195
/// Options for enhanced HTML-to-Markdown conversion.
///
/// Passed by reference to [`convert_html_to_markdown_enhanced`]. The
/// `Default` impl turns every flag on.
// The struct is a set of independent on/off switches, so clippy's suggestion
// to replace the bools with enums is deliberately silenced.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Clone)]
pub struct EnhancedOptions {
    /// Extract LaTeX formulas from img.formula, `KaTeX`, `MathJax` elements.
    ///
    /// NOTE(review): `convert_html_to_markdown_enhanced` in this file does not
    /// read this flag — confirm where (or whether) it is honoured.
    pub extract_latex: bool,
    /// Extract article metadata (author, date, hubs, tags).
    pub extract_metadata: bool,
    /// Apply post-processing (unicode normalization, LaTeX spacing, etc.).
    pub post_process: bool,
    /// Detect and correct code block languages.
    ///
    /// NOTE(review): `convert_html_to_markdown_enhanced` in this file does not
    /// read this flag — confirm where (or whether) it is honoured.
    pub detect_code_language: bool,
}
209
impl Default for EnhancedOptions {
    /// Returns options with every enhancement flag set to `true`.
    fn default() -> Self {
        Self {
            extract_latex: true,
            extract_metadata: true,
            post_process: true,
            detect_code_language: true,
        }
    }
}
220
/// Result of enhanced HTML-to-Markdown conversion.
///
/// Produced by [`convert_html_to_markdown_enhanced`].
#[derive(Debug, Clone)]
pub struct EnhancedMarkdownResult {
    /// The final Markdown text, including any inserted metadata header/footer
    /// and any post-processing that was requested.
    pub markdown: String,
    /// The extracted article metadata; `Some` when `extract_metadata` was
    /// enabled in the options, `None` otherwise.
    pub metadata: Option<metadata::ArticleMetadata>,
}
227
228/// Convert HTML to Markdown with enhanced options.
229///
230/// Supports LaTeX formula extraction, metadata extraction, and
231/// post-processing pipeline matching the JavaScript implementation.
232///
233/// # Arguments
234///
235/// * `html` - The HTML content to convert
236/// * `base_url` - Optional base URL for resolving relative URLs
237/// * `options` - Enhanced conversion options
238///
239/// # Returns
240///
241/// Enhanced result with markdown text and optional metadata
242///
243/// # Errors
244///
245/// Returns an error if base conversion fails
246pub fn convert_html_to_markdown_enhanced(
247    html: &str,
248    base_url: Option<&str>,
249    options: &EnhancedOptions,
250) -> Result<EnhancedMarkdownResult> {
251    // Start with basic markdown conversion
252    let mut md = markdown::convert_html_to_markdown(html, base_url)?;
253
254    // Extract metadata if requested
255    let extracted_metadata = if options.extract_metadata {
256        let meta = metadata::extract_metadata(html);
257        // Prepend metadata block
258        let header_lines = metadata::format_metadata_block(&meta);
259        if !header_lines.is_empty() {
260            let header = header_lines.join("\n");
261            // Insert after the first heading
262            if let Some(pos) = md.find("\n\n") {
263                md = format!("{}\n\n{}\n{}", &md[..pos], header, &md[pos + 2..]);
264            } else {
265                md = format!("{header}\n\n{md}");
266            }
267        }
268        // Append footer block
269        let footer_lines = metadata::format_footer_block(&meta);
270        if !footer_lines.is_empty() {
271            md.push_str("\n\n");
272            md.push_str(&footer_lines.join("\n"));
273        }
274        Some(meta)
275    } else {
276        None
277    };
278
279    // Apply post-processing if requested
280    if options.post_process {
281        md = postprocess::post_process_markdown(&md, &postprocess::PostProcessOptions::default());
282    }
283
284    Ok(EnhancedMarkdownResult {
285        markdown: md,
286        metadata: extracted_metadata,
287    })
288}
289
290// Re-export commonly used types
291pub use browser::BrowserEngine;