web_capture/lib.rs
1//! # web-capture
2//!
3//! A library and CLI/microservice to render web pages as HTML, Markdown, or PNG screenshots.
4//!
5//! ## Features
6//!
7//! - Fetch HTML content from URLs
8//! - Convert HTML to Markdown
9//! - Capture PNG screenshots of web pages
10//! - Convert relative URLs to absolute URLs
11//! - Support for headless browser rendering via browser-commander
12//!
13//! ## Example
14//!
15//! ```rust,no_run
16//! use web_capture::{fetch_html, convert_html_to_markdown, capture_screenshot};
17//!
18//! #[tokio::main]
19//! async fn main() -> anyhow::Result<()> {
20//! // Fetch HTML from a URL
21//! let html = fetch_html("https://example.com").await?;
22//! println!("HTML length: {}", html.len());
23//!
24//! // Convert HTML to Markdown
25//! let markdown = convert_html_to_markdown(&html, Some("https://example.com"))?;
26//! println!("Markdown: {}", markdown);
27//!
28//! // Capture a screenshot
29//! let screenshot = capture_screenshot("https://example.com").await?;
30//! println!("Screenshot size: {} bytes", screenshot.len());
31//!
32//! Ok(())
33//! }
34//! ```
35
36pub mod animation;
37pub mod batch;
38pub mod browser;
39pub mod extract_images;
40pub mod figures;
41pub mod gdocs;
42pub mod html;
43pub mod latex;
44pub mod localize_images;
45pub mod markdown;
46pub mod metadata;
47pub mod postprocess;
48pub mod themed_image;
49pub mod verify;
50
51use thiserror::Error;
52
53/// Version of the web-capture library
54pub const VERSION: &str = env!("CARGO_PKG_VERSION");
55
56/// Error types for web-capture operations
57#[derive(Error, Debug)]
58pub enum WebCaptureError {
59 #[error("Failed to fetch URL: {0}")]
60 FetchError(String),
61
62 #[error("Failed to parse HTML: {0}")]
63 ParseError(String),
64
65 #[error("Failed to convert to Markdown: {0}")]
66 MarkdownError(String),
67
68 #[error("Failed to capture screenshot: {0}")]
69 ScreenshotError(String),
70
71 #[error("Browser error: {0}")]
72 BrowserError(String),
73
74 #[error("Invalid URL: {0}")]
75 InvalidUrl(String),
76
77 #[error("IO error: {0}")]
78 IoError(#[from] std::io::Error),
79
80 #[error("Request error: {0}")]
81 RequestError(#[from] reqwest::Error),
82}
83
84/// Result type for web-capture operations
85pub type Result<T> = std::result::Result<T, WebCaptureError>;
86
87/// Fetch HTML content from a URL
88///
89/// This function makes a simple HTTP GET request to fetch the HTML content.
90/// For JavaScript-heavy pages, use `render_html` instead.
91///
92/// # Arguments
93///
94/// * `url` - The URL to fetch
95///
96/// # Returns
97///
98/// The HTML content as a string
99///
100/// # Errors
101///
102/// Returns an error if the fetch fails or the response cannot be decoded
103pub async fn fetch_html(url: &str) -> Result<String> {
104 html::fetch_html(url).await
105}
106
107/// Render HTML content from a URL using a headless browser
108///
109/// This function uses browser-commander to launch a headless browser,
110/// navigate to the URL, and return the rendered HTML content.
111///
112/// # Arguments
113///
114/// * `url` - The URL to render
115///
116/// # Returns
117///
118/// The rendered HTML content as a string
119///
120/// # Errors
121///
122/// Returns an error if browser operations fail
123pub async fn render_html(url: &str) -> Result<String> {
124 browser::render_html(url).await
125}
126
127/// Convert HTML content to Markdown
128///
129/// # Arguments
130///
131/// * `html` - The HTML content to convert
132/// * `base_url` - Optional base URL for converting relative URLs to absolute
133///
134/// # Returns
135///
136/// The Markdown content as a string
137///
138/// # Errors
139///
140/// Returns an error if conversion fails
141pub fn convert_html_to_markdown(html: &str, base_url: Option<&str>) -> Result<String> {
142 markdown::convert_html_to_markdown(html, base_url)
143}
144
145/// Capture a PNG screenshot of a URL
146///
147/// This function uses browser-commander to launch a headless browser,
148/// navigate to the URL, and capture a screenshot.
149///
150/// # Arguments
151///
152/// * `url` - The URL to capture
153///
154/// # Returns
155///
156/// The PNG image data as bytes
157///
158/// # Errors
159///
160/// Returns an error if browser operations fail
161pub async fn capture_screenshot(url: &str) -> Result<Vec<u8>> {
162 browser::capture_screenshot(url).await
163}
164
165/// Convert relative URLs to absolute URLs in HTML content
166///
167/// # Arguments
168///
169/// * `html` - The HTML content to process
170/// * `base_url` - The base URL to use for resolving relative URLs
171///
172/// # Returns
173///
174/// The HTML content with absolute URLs
175#[must_use]
176pub fn convert_relative_urls(html: &str, base_url: &str) -> String {
177 html::convert_relative_urls(html, base_url)
178}
179
180/// Convert HTML content to UTF-8 encoding
181///
182/// Detects the current encoding from meta tags and converts to UTF-8 if needed.
183///
184/// # Arguments
185///
186/// * `html` - The HTML content to convert
187///
188/// # Returns
189///
190/// The UTF-8 encoded HTML content
191#[must_use]
192pub fn convert_to_utf8(html: &str) -> String {
193 html::convert_to_utf8(html)
194}
195
/// Options for enhanced HTML-to-Markdown conversion.
///
/// Every option is enabled by [`Default`]. Use struct-update syntax to switch
/// individual stages off:
///
/// ```
/// use web_capture::EnhancedOptions;
///
/// let opts = EnhancedOptions { post_process: false, ..EnhancedOptions::default() };
/// assert!(opts.extract_metadata);
/// assert!(!opts.post_process);
/// ```
// Each flag toggles one independent stage, so a flat set of bools is the
// intended shape of this configuration type.
#[allow(clippy::struct_excessive_bools)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EnhancedOptions {
    /// Extract LaTeX formulas from img.formula, `KaTeX`, `MathJax` elements.
    pub extract_latex: bool,
    /// Extract article metadata (author, date, hubs, tags).
    pub extract_metadata: bool,
    /// Apply post-processing (unicode normalization, LaTeX spacing, etc.).
    pub post_process: bool,
    /// Detect and correct code block languages.
    pub detect_code_language: bool,
}

impl Default for EnhancedOptions {
    /// Returns options with every stage enabled.
    fn default() -> Self {
        Self {
            extract_latex: true,
            extract_metadata: true,
            post_process: true,
            detect_code_language: true,
        }
    }
}
220
221/// Result of enhanced HTML-to-Markdown conversion.
222#[derive(Debug, Clone)]
223pub struct EnhancedMarkdownResult {
224 pub markdown: String,
225 pub metadata: Option<metadata::ArticleMetadata>,
226}
227
228/// Convert HTML to Markdown with enhanced options.
229///
230/// Supports LaTeX formula extraction, metadata extraction, and
231/// post-processing pipeline matching the JavaScript implementation.
232///
233/// # Arguments
234///
235/// * `html` - The HTML content to convert
236/// * `base_url` - Optional base URL for resolving relative URLs
237/// * `options` - Enhanced conversion options
238///
239/// # Returns
240///
241/// Enhanced result with markdown text and optional metadata
242///
243/// # Errors
244///
245/// Returns an error if base conversion fails
246pub fn convert_html_to_markdown_enhanced(
247 html: &str,
248 base_url: Option<&str>,
249 options: &EnhancedOptions,
250) -> Result<EnhancedMarkdownResult> {
251 // Start with basic markdown conversion
252 let mut md = markdown::convert_html_to_markdown(html, base_url)?;
253
254 // Extract metadata if requested
255 let extracted_metadata = if options.extract_metadata {
256 let meta = metadata::extract_metadata(html);
257 // Prepend metadata block
258 let header_lines = metadata::format_metadata_block(&meta);
259 if !header_lines.is_empty() {
260 let header = header_lines.join("\n");
261 // Insert after the first heading
262 if let Some(pos) = md.find("\n\n") {
263 md = format!("{}\n\n{}\n{}", &md[..pos], header, &md[pos + 2..]);
264 } else {
265 md = format!("{header}\n\n{md}");
266 }
267 }
268 // Append footer block
269 let footer_lines = metadata::format_footer_block(&meta);
270 if !footer_lines.is_empty() {
271 md.push_str("\n\n");
272 md.push_str(&footer_lines.join("\n"));
273 }
274 Some(meta)
275 } else {
276 None
277 };
278
279 // Apply post-processing if requested
280 if options.post_process {
281 md = postprocess::post_process_markdown(&md, &postprocess::PostProcessOptions::default());
282 }
283
284 Ok(EnhancedMarkdownResult {
285 markdown: md,
286 metadata: extracted_metadata,
287 })
288}
289
290// Re-export commonly used types
291pub use browser::BrowserEngine;