web_capture/lib.rs
1//! # web-capture
2//!
3//! A library and CLI/microservice to render web pages as HTML, Markdown, or PNG screenshots.
4//!
5//! ## Features
6//!
7//! - Fetch HTML content from URLs
8//! - Convert HTML to Markdown
9//! - Capture PNG screenshots of web pages
10//! - Convert relative URLs to absolute URLs
11//! - Support for headless browser rendering via browser-commander
12//!
13//! ## Example
14//!
15//! ```rust,no_run
16//! use web_capture::{fetch_html, convert_html_to_markdown, capture_screenshot};
17//!
18//! #[tokio::main]
19//! async fn main() -> anyhow::Result<()> {
20//! // Fetch HTML from a URL
21//! let html = fetch_html("https://example.com").await?;
22//! println!("HTML length: {}", html.len());
23//!
24//! // Convert HTML to Markdown
25//! let markdown = convert_html_to_markdown(&html, Some("https://example.com"))?;
26//! println!("Markdown: {}", markdown);
27//!
28//! // Capture a screenshot
29//! let screenshot = capture_screenshot("https://example.com").await?;
30//! println!("Screenshot size: {} bytes", screenshot.len());
31//!
32//! Ok(())
33//! }
34//! ```
35
36pub mod animation;
37pub mod batch;
38pub mod browser;
39pub mod figures;
40pub mod gdocs;
41pub mod html;
42pub mod latex;
43pub mod localize_images;
44pub mod markdown;
45pub mod metadata;
46pub mod postprocess;
47pub mod themed_image;
48pub mod verify;
49
50use thiserror::Error;
51
/// Version of the web-capture library.
///
/// The value is read from the `CARGO_PKG_VERSION` environment variable at
/// compile time through `env!`, so it is fixed when the crate is built.
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
54
55/// Error types for web-capture operations
56#[derive(Error, Debug)]
57pub enum WebCaptureError {
58 #[error("Failed to fetch URL: {0}")]
59 FetchError(String),
60
61 #[error("Failed to parse HTML: {0}")]
62 ParseError(String),
63
64 #[error("Failed to convert to Markdown: {0}")]
65 MarkdownError(String),
66
67 #[error("Failed to capture screenshot: {0}")]
68 ScreenshotError(String),
69
70 #[error("Browser error: {0}")]
71 BrowserError(String),
72
73 #[error("Invalid URL: {0}")]
74 InvalidUrl(String),
75
76 #[error("IO error: {0}")]
77 IoError(#[from] std::io::Error),
78
79 #[error("Request error: {0}")]
80 RequestError(#[from] reqwest::Error),
81}
82
83/// Result type for web-capture operations
84pub type Result<T> = std::result::Result<T, WebCaptureError>;
85
86/// Fetch HTML content from a URL
87///
88/// This function makes a simple HTTP GET request to fetch the HTML content.
89/// For JavaScript-heavy pages, use `render_html` instead.
90///
91/// # Arguments
92///
93/// * `url` - The URL to fetch
94///
95/// # Returns
96///
97/// The HTML content as a string
98///
99/// # Errors
100///
101/// Returns an error if the fetch fails or the response cannot be decoded
102pub async fn fetch_html(url: &str) -> Result<String> {
103 html::fetch_html(url).await
104}
105
106/// Render HTML content from a URL using a headless browser
107///
108/// This function uses browser-commander to launch a headless browser,
109/// navigate to the URL, and return the rendered HTML content.
110///
111/// # Arguments
112///
113/// * `url` - The URL to render
114///
115/// # Returns
116///
117/// The rendered HTML content as a string
118///
119/// # Errors
120///
121/// Returns an error if browser operations fail
122pub async fn render_html(url: &str) -> Result<String> {
123 browser::render_html(url).await
124}
125
126/// Convert HTML content to Markdown
127///
128/// # Arguments
129///
130/// * `html` - The HTML content to convert
131/// * `base_url` - Optional base URL for converting relative URLs to absolute
132///
133/// # Returns
134///
135/// The Markdown content as a string
136///
137/// # Errors
138///
139/// Returns an error if conversion fails
140pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
141 markdown::convert_html_to_markdown(html, base_url)
142}
143
144/// Capture a PNG screenshot of a URL
145///
146/// This function uses browser-commander to launch a headless browser,
147/// navigate to the URL, and capture a screenshot.
148///
149/// # Arguments
150///
151/// * `url` - The URL to capture
152///
153/// # Returns
154///
155/// The PNG image data as bytes
156///
157/// # Errors
158///
159/// Returns an error if browser operations fail
160pub async fn capture_screenshot(url: &str) -> Result<Vec<u8>> {
161 browser::capture_screenshot(url).await
162}
163
164/// Convert relative URLs to absolute URLs in HTML content
165///
166/// # Arguments
167///
168/// * `html` - The HTML content to process
169/// * `base_url` - The base URL to use for resolving relative URLs
170///
171/// # Returns
172///
173/// The HTML content with absolute URLs
174#[must_use]
175pub fn convert_relative_urls(html: &str, base_url: &str) -> String {
176 html::convert_relative_urls(html, base_url)
177}
178
179/// Convert HTML content to UTF-8 encoding
180///
181/// Detects the current encoding from meta tags and converts to UTF-8 if needed.
182///
183/// # Arguments
184///
185/// * `html` - The HTML content to convert
186///
187/// # Returns
188///
189/// The UTF-8 encoded HTML content
190#[must_use]
191pub fn convert_to_utf8(html: &str) -> String {
192 html::convert_to_utf8(html)
193}
194
/// Switches controlling the enhanced HTML-to-Markdown conversion.
///
/// Each field is an independent on/off toggle; [`Default`] turns all of them
/// on.
// The flags are unrelated toggles rather than an encoded state machine, so
// the "too many bools" lint is silenced deliberately.
#[allow(clippy::struct_excessive_bools)]
#[derive(Clone, Debug)]
pub struct EnhancedOptions {
    /// Extract LaTeX formulas from img.formula, `KaTeX`, `MathJax` elements.
    pub extract_latex: bool,
    /// Extract article metadata (author, date, hubs, tags).
    pub extract_metadata: bool,
    /// Apply post-processing (unicode normalization, LaTeX spacing, etc.).
    pub post_process: bool,
    /// Detect and correct code block languages.
    pub detect_code_language: bool,
}

impl Default for EnhancedOptions {
    /// Build the default configuration, in which every toggle is enabled.
    fn default() -> Self {
        // Everything is on by default; callers opt out field by field.
        const ENABLED: bool = true;

        Self {
            detect_code_language: ENABLED,
            post_process: ENABLED,
            extract_metadata: ENABLED,
            extract_latex: ENABLED,
        }
    }
}
219
220/// Result of enhanced HTML-to-Markdown conversion.
221#[derive(Debug, Clone)]
222pub struct EnhancedMarkdownResult {
223 pub markdown: String,
224 pub metadata: Option<metadata::ArticleMetadata>,
225}
226
227/// Convert HTML to Markdown with enhanced options.
228///
229/// Supports LaTeX formula extraction, metadata extraction, and
230/// post-processing pipeline matching the JavaScript implementation.
231///
232/// # Arguments
233///
234/// * `html` - The HTML content to convert
235/// * `base_url` - Optional base URL for resolving relative URLs
236/// * `options` - Enhanced conversion options
237///
238/// # Returns
239///
240/// Enhanced result with markdown text and optional metadata
241///
242/// # Errors
243///
244/// Returns an error if base conversion fails
245pub fn convert_html_to_markdown_enhanced(
246 html: &str,
247 base_url: Option<&str>,
248 options: &EnhancedOptions,
249) -> Result<EnhancedMarkdownResult> {
250 // Start with basic markdown conversion
251 let mut md = markdown::convert_html_to_markdown(html, base_url)?;
252
253 // Extract metadata if requested
254 let extracted_metadata = if options.extract_metadata {
255 let meta = metadata::extract_metadata(html);
256 // Prepend metadata block
257 let header_lines = metadata::format_metadata_block(&meta);
258 if !header_lines.is_empty() {
259 let header = header_lines.join("\n");
260 // Insert after the first heading
261 if let Some(pos) = md.find("\n\n") {
262 md = format!("{}\n\n{}\n{}", &md[..pos], header, &md[pos + 2..]);
263 } else {
264 md = format!("{header}\n\n{md}");
265 }
266 }
267 // Append footer block
268 let footer_lines = metadata::format_footer_block(&meta);
269 if !footer_lines.is_empty() {
270 md.push_str("\n\n");
271 md.push_str(&footer_lines.join("\n"));
272 }
273 Some(meta)
274 } else {
275 None
276 };
277
278 // Apply post-processing if requested
279 if options.post_process {
280 md = postprocess::post_process_markdown(&md, &postprocess::PostProcessOptions::default());
281 }
282
283 Ok(EnhancedMarkdownResult {
284 markdown: md,
285 metadata: extracted_metadata,
286 })
287}
288
289// Re-export commonly used types
290pub use browser::BrowserEngine;